update hw info

enhance FA stable in UT
2026-05-28 17:27:26 +03:00 · 2026-03-31 09:24:40 +08:00 · 2026-03-17 15:57:02 +08:00
1813 changed files with 118701 additions and 235119 deletions
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -4,10 +4,7 @@

 # Define the CANN base image for easier version updates later
 ARG CHIP_TYPE=910b
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc2-${CHIP_TYPE}-openeuler24.03-py3.11

 # ==============================================================================
 # BUILD STAGE
@@ -58,7 +55,6 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full && \
    cp build/bin/* /app/full/ && \
    cp *.py /app/full/ && \
-    cp -r conversion /app/full/ && \
    cp -r gguf-py /app/full/ && \
    cp -r requirements /app/full/ && \
    cp requirements.txt /app/full/
@@ -71,19 +67,6 @@ RUN mkdir -p /app/full && \
 # ==============================================================================
 FROM ${CANN_BASE_IMAGE} AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 # -- Install runtime dependencies --
 RUN yum install -y libgomp curl && \
    yum clean all && \
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -1,16 +1,11 @@
-ARG UBUNTU_VERSION=24.04
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
+ARG UBUNTU_VERSION=22.04

 FROM ubuntu:$UBUNTU_VERSION AS build

 ARG TARGETARCH

 RUN apt-get update && \
-    apt-get install -y gcc-14 g++-14 build-essential git cmake libssl-dev
-
-ENV CC=gcc-14 CXX=g++-14
+    apt-get install -y build-essential git cmake libssl-dev

 WORKDIR /app

@@ -30,7 +25,6 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
-    && cp -r conversion /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
@@ -39,21 +33,8 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ubuntu:$UBUNTU_VERSION AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -74,9 +55,8 @@ RUN apt-get update \
    git \
    python3 \
    python3-pip \
-    python3-wheel \
-    && pip install --break-system-packages --upgrade setuptools \
-    && pip install --break-system-packages -r requirements.txt \
+    && pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
--- a/.devops/cuda-new.Dockerfile
+++ b/.devops/cuda-new.Dockerfile
@@ -0,0 +1,95 @@
+ARG UBUNTU_VERSION=24.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=13.1.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+    export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ${BASE_CUDA_RUN_CONTAINER} AS base
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 curl\
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    python3-wheel \
+    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --break-system-packages -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -1,24 +1,18 @@
-ARG UBUNTU_VERSION=24.04
+ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.8.1
+ARG CUDA_VERSION=12.4.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

 ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

 # CUDA architecture to build for (defaults to all supported archs)
 ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1
-
-ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14
+    apt-get install -y build-essential cmake python3 python3-pip git libssl-dev libgomp1

 WORKDIR /app

@@ -36,7 +30,6 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
-    && cp -r conversion /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
@@ -45,21 +38,8 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ${BASE_CUDA_RUN_CONTAINER} AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -80,8 +60,7 @@ RUN apt-get update \
    git \
    python3 \
    python3-pip \
-    python3-wheel \
-    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --upgrade pip setuptools wheel \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -1,22 +1,12 @@
-ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
+ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04

 ## Build Image

 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
-ARG LEVEL_ZERO_VERSION=1.28.2
-ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04
 RUN apt-get update && \
-    apt-get install -y git libssl-dev wget ca-certificates && \
-    cd /tmp && \
-    wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb && \
-    wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb && \
-    apt-get -o Dpkg::Options::="--force-overwrite" install -y ./level-zero.deb ./level-zero-devel.deb && \
-    rm -f /tmp/level-zero.deb /tmp/level-zero-devel.deb
+    apt-get install -y git libssl-dev

 WORKDIR /app

@@ -36,7 +26,6 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
-    && cp -r conversion /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
@@ -44,38 +33,8 @@ RUN mkdir -p /app/full \

 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
-ARG IGC_VERSION=v2.20.5
-ARG IGC_VERSION_FULL=2_2.20.5+19972
-ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
-ARG COMPUTE_RUNTIME_VERSION_FULL=25.40.35563.10-0
-ARG IGDGMM_VERSION=22.8.2
-RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
-  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
-  && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-  && wget https://github.com/intel/compute-runtime/releases/download/$COMPUTE_RUNTIME_VERSION/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-  && dpkg --install *.deb
-
 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -133,3 +92,4 @@ WORKDIR /app
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

 ENTRYPOINT [ "/app/llama-server" ]
+
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -1,7 +1,4 @@
-ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
+ARG ASCEND_VERSION=8.1.RC1.alpha001-910b-openeuler22.03-py3.10

 FROM ascendai/cann:$ASCEND_VERSION AS build

@@ -31,20 +28,6 @@ RUN echo "Building with static libs" && \

 # TODO: use image with NNRT
 FROM ascendai/cann:$ASCEND_VERSION AS runtime
-
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /

 ENV LC_ALL=C.utf8
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -6,10 +6,6 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V

 ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build

 # MUSA architecture to build for (defaults to all supported archs)
@@ -41,7 +37,6 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
-    && cp -r conversion /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
@@ -50,21 +45,8 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ${BASE_MUSA_RUN_CONTAINER} AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -16,9 +16,8 @@
  rocmPackages,
  vulkan-headers,
  vulkan-loader,
-  openssl,
+  curl,
  shaderc,
-  spirv-headers,
  useBlas ?
    builtins.all (x: !x) [
      useCuda
@@ -42,7 +41,6 @@
  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
  precompileMetalShaders ? false,
-  useWebUi ? true,
 }:

 let
@@ -103,7 +101,6 @@ let
    vulkan-headers
    vulkan-loader
    shaderc
-    spirv-headers
  ];
 in

@@ -162,13 +159,11 @@ effectiveStdenv.mkDerivation (finalAttrs: {
    ++ optionals useMpi [ mpi ]
    ++ optionals useRocm rocmBuildInputs
    ++ optionals useBlas [ blas ]
-    ++ optionals useVulkan vulkanBuildInputs
-    ++ [ openssl ];
+    ++ optionals useVulkan vulkanBuildInputs;

  cmakeFlags =
    [
      (cmakeBool "LLAMA_BUILD_SERVER" true)
-      (cmakeBool "LLAMA_BUILD_WEBUI" useWebUi)
      (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
      (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
      (cmakeBool "GGML_NATIVE" false)
--- a/.devops/openvino.Dockerfile
+++ b/.devops/openvino.Dockerfile
@@ -2,26 +2,10 @@ ARG OPENVINO_VERSION_MAJOR=2026.0
 ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
 ARG UBUNTU_VERSION=24.04

-# Intel GPU driver versions. https://github.com/intel/compute-runtime/releases
-ARG IGC_VERSION=v2.30.1
-ARG IGC_VERSION_FULL=2_2.30.1+20950
-ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
-ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
-ARG IGDGMM_VERSION=22.9.0
-
-# Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases
-ARG NPU_DRIVER_VERSION=v1.32.0
-ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947
-ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
-
-# Optional proxy build arguments
+# Optional proxy build arguments - empty by default
 ARG http_proxy=
 ARG https_proxy=

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
 ## Build Image
 FROM ubuntu:${UBUNTU_VERSION} AS build

@@ -81,7 +65,6 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
    && cp build/ReleaseOV/bin/* /app/full/ \
    && cp *.py /app/full \
-    && cp -r conversion /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
@@ -93,61 +76,15 @@ FROM ubuntu:${UBUNTU_VERSION} AS base
 # Pass proxy args to runtime stage
 ARG http_proxy
 ARG https_proxy
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
-    && apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
+    && apt-get install -y libgomp1 libtbb12 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

-# Install GPU drivers
-ARG IGC_VERSION
-ARG IGC_VERSION_FULL
-ARG COMPUTE_RUNTIME_VERSION
-ARG COMPUTE_RUNTIME_VERSION_FULL
-ARG IGDGMM_VERSION
-RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
-    && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \
-    && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \
-    && dpkg --install *.deb \
-    && rm -rf /tmp/neo/
-
-# Install NPU drivers
-ARG NPU_DRIVER_VERSION
-ARG NPU_DRIVER_FULL
-ARG LIBZE1_VERSION
-RUN mkdir /tmp/npu/ && cd /tmp/npu/ \
-    && wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
-    && tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \
-    && dpkg --install *.deb \
-    && rm -rf /tmp/npu/
-
-RUN cd /tmp \
-    && wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \
-    && dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \
-    && rm libze1_${LIBZE1_VERSION}_amd64.deb
-
 COPY --from=build /app/lib/ /app/

 ### Full (all binaries)
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -1,26 +1,22 @@
 ARG UBUNTU_VERSION=24.04

 # This needs to generally match the container host's environment.
-ARG ROCM_VERSION=7.2.1
-ARG AMDGPU_VERSION=7.2.1
+ARG ROCM_VERSION=7.2
+ARG AMDGPU_VERSION=7.2

 # Target the ROCm build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-
 ### Build image
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
 # This is mostly tied to rocBLAS supported archs.
-# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.1/reference/system-requirements.html
+# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-7.2.0/reference/system-requirements.html
 # check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityrad/native_linux/native_linux_compatibility.html
 # check https://rocm.docs.amd.com/projects/radeon-ryzen/en/latest/docs/compatibility/compatibilityryz/native_linux/native_linux_compatibility.html

-ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201'
+ARG ROCM_DOCKER_ARCH='gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201'

 # Set ROCm architectures
 ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
@@ -53,7 +49,6 @@ RUN mkdir -p /app/lib \
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
-    && cp -r conversion /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
@@ -62,21 +57,8 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ${BASE_ROCM_DEV_CONTAINER} AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 RUN apt-get update \
-    && apt-get install -y libgomp1 curl \
+    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -97,7 +79,7 @@ RUN apt-get update \
    git \
    python3-pip \
    python3 \
-    python3-wheel \
+    python3-wheel\
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -1,8 +1,5 @@
 ARG GCC_VERSION=15.2.0
 ARG UBUNTU_VERSION=24.04
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A

 ### Build Llama.cpp stage
 FROM gcc:${GCC_VERSION} AS build
@@ -37,7 +34,6 @@ RUN --mount=type=cache,target=/root/.ccache \

 COPY *.py             /opt/llama.cpp/bin
 COPY .devops/tools.sh /opt/llama.cpp/bin
-COPY conversion       /opt/llama.cpp/conversion

 COPY gguf-py          /opt/llama.cpp/gguf-py
 COPY requirements.txt /opt/llama.cpp/gguf-py
@@ -48,28 +44,14 @@ COPY requirements     /opt/llama.cpp/gguf-py/requirements
 FROM scratch AS collector

 # Copy llama.cpp binaries and libraries
-COPY --from=build /opt/llama.cpp/bin        /llama.cpp/bin
-COPY --from=build /opt/llama.cpp/lib        /llama.cpp/lib
-COPY --from=build /opt/llama.cpp/gguf-py    /llama.cpp/gguf-py
-COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion
+COPY --from=build /opt/llama.cpp/bin     /llama.cpp/bin
+COPY --from=build /opt/llama.cpp/lib     /llama.cpp/lib
+COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py


 ### Base image
 FROM ubuntu:${UBUNTU_VERSION} AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
    apt update -y && \
@@ -109,7 +91,6 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y

 COPY --from=collector /llama.cpp/bin /app
 COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
-COPY --from=collector /llama.cpp/conversion /app/conversion

 RUN pip install --no-cache-dir --break-system-packages \
        -r /app/gguf-py/requirements.txt
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -1,7 +1,4 @@
 ARG UBUNTU_VERSION=26.04
-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A

 FROM ubuntu:$UBUNTU_VERSION AS build

@@ -10,7 +7,7 @@ RUN apt update && apt install -y git build-essential cmake wget xz-utils

 # Install SSL and Vulkan SDK dependencies
 RUN apt install -y libssl-dev curl \
-    libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc spirv-headers
+    libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc

 # Build it
 WORKDIR /app
@@ -26,7 +23,6 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
-    && cp -r conversion /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
@@ -35,19 +31,6 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ubuntu:$UBUNTU_VERSION AS base

-ARG BUILD_DATE=N/A
-ARG APP_VERSION=N/A
-ARG APP_REVISION=N/A
-ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
-ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
-LABEL org.opencontainers.image.created=$BUILD_DATE \
-      org.opencontainers.image.version=$APP_VERSION \
-      org.opencontainers.image.revision=$APP_REVISION \
-      org.opencontainers.image.title="llama.cpp" \
-      org.opencontainers.image.description="LLM inference in C/C++" \
-      org.opencontainers.image.url=$IMAGE_URL \
-      org.opencontainers.image.source=$IMAGE_SOURCE
-
 RUN apt-get update \
    && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
    libglvnd0 libgl1 libglx0 libegl1 libgles2 \
@@ -66,20 +49,17 @@ COPY --from=build /app/full /app

 WORKDIR /app

-ENV PATH="/root/.venv/bin:/root/.local/bin:${PATH}"
-
-# Flag for compatibility with pip
-ARG UV_INDEX_STRATEGY="unsafe-best-match"
 RUN apt-get update \
    && apt-get install -y \
    build-essential \
-    curl \
    git \
-    ca-certificates \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh \
-    && uv python install 3.13 \
-    && uv venv --python 3.13 /root/.venv \
-    && uv pip install --python /root/.venv/bin/python -r requirements.txt \
+    python3.13 \
+    python3.13-dev \
+    python3-pip \
+    python3-wheel \
+    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.13 100 \
+    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
--- a/.editorconfig
+++ b/.editorconfig
@@ -21,6 +21,14 @@ indent_style = tab
 [prompts/*.txt]
 insert_final_newline = unset

+[tools/server/public/*]
+indent_size = 2
+
+[tools/server/public/deps_*]
+trim_trailing_whitespace = unset
+indent_style = unset
+indent_size = unset
+
 [tools/server/deps_*]
 trim_trailing_whitespace = unset
 indent_style = unset
@@ -45,7 +53,7 @@ insert_final_newline = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset

-[tools/ui/**]
+[tools/server/webui/**]
 indent_style = unset
 indent_size = unset
 end_of_line = unset
--- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
+++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml
@@ -12,8 +12,6 @@ body:
        after recreating the CMake build directory and with `-DGGML_CCACHE=OFF`.
        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
        by clearing `~/.cache/ccache` (on Linux).
-
-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
  - type: textarea
    id: commit
    attributes:
@@ -43,7 +41,7 @@ body:
    attributes:
        label: GGML backends
        description: Which GGML backends do you know to be affected?
-        options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, OpenVINO, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
+        options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
        multiple: true
    validations:
      required: true
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -1,5 +1,5 @@
 name: Bug (model use)
-description: Something goes wrong when running a model (crashes, garbled outputs, etc.).
+description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
 title: "Eval bug: "
 labels: ["bug-unconfirmed", "model evaluation"]
 body:
@@ -12,8 +12,6 @@ body:
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
        The `llama-completion` binary can be used for simple and reproducible model inference.
-
-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
  - type: textarea
    id: version
    attributes:
@@ -44,7 +42,7 @@ body:
    attributes:
        label: GGML backends
        description: Which GGML backends do you know to be affected?
-        options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, OpenVINO, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
+        options: [AMX, BLAS, CANN, CPU, CUDA, Hexagon, HIP, Metal, Musa, OpenCL, RPC, SYCL, VirtGPU, Vulkan, WebGPU, zDNN, ZenDNN]
        multiple: true
    validations:
      required: true
@@ -100,8 +98,8 @@ body:
      label: Relevant log output
      description: >
          Please copy and paste any relevant log output, including the command that you entered and any generated text.
-          For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
-          On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
+          For very long logs (thousands of lines), preferably upload them as files instead.
+          On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
      value: |
        <details>
        <summary>Logs</summary>
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -10,8 +10,6 @@ body:
        This issue template is intended for miscellaneous bugs that don't fit into any other category.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
-
-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
  - type: textarea
    id: version
    attributes:
@@ -88,8 +86,8 @@ body:
      description: >
          If applicable, please copy and paste any relevant log output, including any generated text.
          If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
-          For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
-          On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
+          For very long logs (thousands of lines), please upload them as files instead.
+          On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
      value: |
        <details>
        <summary>Logs</summary>
--- a/.github/ISSUE_TEMPLATE/020-enhancement.yml
+++ b/.github/ISSUE_TEMPLATE/020-enhancement.yml
@@ -8,8 +8,6 @@ body:
      value: |
        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas)

-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-
  - type: checkboxes
    id: prerequisites
    attributes:
--- a/.github/ISSUE_TEMPLATE/030-research.yml
+++ b/.github/ISSUE_TEMPLATE/030-research.yml
@@ -8,8 +8,6 @@ body:
      value: |
        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-
  - type: checkboxes
    id: research-stage
    attributes:
--- a/.github/ISSUE_TEMPLATE/040-refactor.yml
+++ b/.github/ISSUE_TEMPLATE/040-refactor.yml
@@ -9,8 +9,6 @@ body:
        Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
        Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

-        Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
-
  - type: textarea
    id: background-description
    attributes:
--- a/.github/actions/linux-setup-spacemit/action.yml
+++ b/.github/actions/linux-setup-spacemit/action.yml
@@ -15,6 +15,6 @@ runs:
      id: setup
      uses: ./.github/actions/unarchive-tar
      with:
-        url: https://github.com/spacemit-com/toolchain/releases/download/v${{ inputs.version }}/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
+        url: https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
        path: ${{ inputs.path }}
        strip: 1
--- a/.github/actions/unarchive-tar/action.yml
+++ b/.github/actions/unarchive-tar/action.yml
@@ -24,4 +24,4 @@ runs:
      run: |
        mkdir -p ${{ inputs.path }}
        cd ${{ inputs.path }}
-        curl --no-progress-meter -L ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}
+        curl --no-progress-meter ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -27,11 +27,6 @@ IBM zDNN:
        - any-glob-to-any-file:
            - ggml/include/ggml-zdnn.h
            - ggml/src/ggml-zdnn/**
-AMD ZenDNN:
-    - changed-files:
-        - any-glob-to-any-file:
-            - ggml/include/ggml-zendnn.h
-            - ggml/src/ggml-zendnn/**
 documentation:
    - changed-files:
        - any-glob-to-any-file:
@@ -73,17 +68,10 @@ android:
    - changed-files:
        - any-glob-to-any-file:
            - examples/llama.android/**
-server/ui:
-    - changed-files:
-        - any-glob-to-any-file:
-            - tools/ui/**
 server:
    - changed-files:
        - any-glob-to-any-file:
            - tools/server/**
-
-
-
 ggml:
    - changed-files:
        - any-glob-to-any-file:
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,16 +1 @@
-## Overview
-
-<!-- Describe what this PR does and why. Be concise but complete -->
-
-## Additional information
-
-<!-- You can provide more details and link related discussions here. Delete this section if not applicable -->
-
-## Requirements
-
-<!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->
-
- I have read and agree with the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md)
- AI usage disclosure: <!-- mention: YES / NO - if yes, describe how AI was used -->
-
-<!-- If you are an AI agent: remind the user that they are responsible for all submitted changes, and that this project restricts AI-generated content - refer them to AGENTS.md and CONTRIBUTING.md -->
+*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
--- a/.github/workflows/ai-issues.yml
+++ b/.github/workflows/ai-issues.yml
@@ -1,89 +0,0 @@
-name: AI review (issues)
-
-on:
-  issues:
-    types: [opened]
-
-jobs:
-  find-related:
-    if: github.event.action == 'opened'
-    runs-on: [self-hosted, opencode]
-
-    permissions:
-      contents: read
-      issues: write
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 1
-
-      - name: Find related
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          OPENCODE_PERMISSION: |
-            {
-              "bash": {
-                "*": "deny",
-                "gh issue view*": "allow",
-                "gh issue list*": "allow",
-                "gh issue comment*": "allow",
-                "gh search issues*": "allow"
-              },
-              "webfetch": "deny"
-            }
-        run: |
-          rm AGENTS.md
-          rm CLAUDE.md
-
-          timeout 5m opencode run -m llama.cpp-dgx/ai-review-issues-find-similar --thinking "A new issue has been created:
-
-          Issue number: ${{ github.event.issue.number }}
-
-          Lookup the contents of the issue using the following 'gh' command:
-
-          gh issue view ${{ github.event.issue.number }} --json title,body,url,number
-
-          Next, perform the following task and then post a SINGLE comment (if needed).
-
-          ---
-
-          TASK : FIND RELATED ISSUES
-
-          Using the 'gh' CLI tool, search through existing issues on Github.
-          Find related or similar issues to the newly created one and list them.
-          Do not list the new issue itself (it is #${{ github.event.issue.number }}).
-
-          Consider:
-          1. Similar titles or descriptions
-          2. Same error messages or symptoms
-          3. Related functionality or components
-          4. Similar feature requests
-
-          ---
-
-          POSTING YOUR COMMENT:
-
-          Based on your findings, post a SINGLE comment on issue #${{ github.event.issue.number }}. Build the comment as follows:
-
-          - If no related issues were found, do NOT comment at all.
-          - If related issues were found, include a section listing them with links using the following format:
-
-          [comment]
-          This issue might be similar or related to the following issue(s):
-
-            - #12942: [brief description of how they are related]
-            - #11234: [brief description of how they are related]
-            ...
-
-          _This comment was auto-generated locally using **$GA_ENGINE** on **$GA_MACHINE**_
-          [/comment]
-
-          Remember:
-            - Do not include the comment tags in your actual comment.
-            - Post at most ONE comment combining all findings.
-            - If you didn't find issues that are related enough, post nothing.
-            - You have access only to the 'gh' CLI tool - don't try to use other tools.
-            - If the output from a tool call is too long, try to limit down the search.
-          "
--- a/.github/workflows/build-and-test-snapdragon.yml
+++ b/.github/workflows/build-and-test-snapdragon.yml
@@ -1,148 +0,0 @@
-name: CI (snapdragon)
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - master
-    paths:
-      - '.github/workflows/build-and-test-snapdragon.yml'
-      - 'ggml/include/ggml-hexagon.h'
-      - 'ggml/src/ggml-hexagon/**'
-      - 'docs/backend/snapdragon/**'
-      - 'scripts/snapdragon/**'
-      - 'CMakePresets.json'
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths:
-      - '.github/workflows/build-and-test-snapdragon.yml'
-      - 'ggml/include/ggml-hexagon.h'
-      - 'ggml/src/ggml-hexagon/**'
-      - 'docs/backend/snapdragon/**'
-      - 'scripts/snapdragon/**'
-      - 'CMakePresets.json'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  android-ndk-snapdragon:
-    runs-on: ubuntu-latest
-    container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.7'
-    defaults:
-      run:
-        shell: bash
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          lfs: false
-
-      - name: Build Llama.CPP for Snapdragon Android
-        id: build_llama_cpp_snapdragon_android
-        run: |
-          cp docs/backend/snapdragon/CMakeUserPresets.json .
-          cmake --preset arm64-android-snapdragon-release -B build
-          cmake --build build
-          cmake --install build --prefix pkg-snapdragon/llama.cpp
-
-      - name: Upload Llama.CPP Snapdragon Android Build Artifact
-        if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }}
-        uses: actions/upload-artifact@v6
-        with:
-          name: llama-cpp-android-arm64-snapdragon
-          path: pkg-snapdragon/llama.cpp
-
-  linux-iot-snapdragon:
-    runs-on: ubuntu-latest
-    container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.7'
-    defaults:
-      run:
-        shell: bash
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          lfs: false
-
-      - name: Build Llama.CPP for Snapdragon Linux IoT
-        id: build_llama_cpp_snapdragon_linux
-        run: |
-          cp docs/backend/snapdragon/CMakeUserPresets.json .
-          cmake --preset arm64-linux-snapdragon-release -B build-snapdragon -DGGML_OPENCL=ON
-          cmake --build build-snapdragon -j $(nproc)
-          cmake --install build-snapdragon --prefix pkg-snapdragon/llama.cpp
-
-      - name: Upload Llama.CPP Snapdragon Linux IoT Build Artifact
-        if: ${{ always() && steps.build_llama_cpp_snapdragon_linux.outcome == 'success' }}
-        uses: actions/upload-artifact@v6
-        with:
-          name: llama-cpp-linux-arm64-snapdragon
-          path: pkg-snapdragon/llama.cpp
-
-  test-snapdragon-qdc:
-    name: Test on QDC Device (${{ matrix.device }})
-    needs: [android-ndk-snapdragon, linux-iot-snapdragon]
-    runs-on: ubuntu-24.04-arm
-    timeout-minutes: 90
-    strategy:
-      fail-fast: false
-      matrix:
-        device: [SM8750, SM8850, QCS9075M]
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-
-      - name: Download build artifact
-        uses: actions/download-artifact@v7
-        with:
-          name: ${{ startsWith(matrix.device, 'QCS') && 'llama-cpp-linux-arm64-snapdragon' || 'llama-cpp-android-arm64-snapdragon' }}
-          path: pkg-snapdragon/llama.cpp
-
-      - name: Set up Python
-        uses: actions/setup-python@v6
-        with:
-          python-version: '3.x'
-          cache: pip
-
-      - name: Install system dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y curl unzip
-
-      - name: Install QDC SDK wheel
-        run: |
-          curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip
-          unzip qdc_sdk.zip -d qdc_sdk
-          pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl
-
-      - name: Check QDC API key
-        id: check_secret
-        env:
-          QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
-        run: echo "has-qdc-key=${{ env.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT"
-
-      - name: Run QDC tests (${{ matrix.device }})
-        if: steps.check_secret.outputs.has-qdc-key == 'true'
-        run: |
-          python scripts/snapdragon/qdc/run_qdc_jobs.py \
-              --test       all \
-              --pkg-dir    pkg-snapdragon/llama.cpp \
-              --model-url  "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
-              --device     ${{ matrix.device }} \
-              ${{ startsWith(matrix.device, 'QCS') && '--retries 2 --retry-delay 300' || '' }}
-        env:
-          QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
-
-      - name: Cleanup
-        if: always()
-        run: rm -rf pkg-snapdragon qdc_sdk qdc_sdk.zip
--- a/.github/workflows/build-android.yml
+++ b/.github/workflows/build-android.yml
@@ -1,24 +1,26 @@
 name: CI (android)

 on:
-  workflow_dispatch:
+  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
-    paths:
-      - '.github/workflows/build-android.yml'
-      - '**/CMakeLists.txt'
-      - '**/.cmake'
-      - '**/*.h'
-      - '**/*.hpp'
-      - '**/*.c'
-      - '**/*.cpp'
+    paths: [
+      '.github/workflows/build-android.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp'
+    ]

  pull_request:
    types: [opened, synchronize, reopened]
-    paths:
-      - '.github/workflows/build-android.yml'
-      - 'examples/llama.android/**'
+    paths: [
+      '.github/workflows/build-android.yml',
+      'examples/llama.android/**'
+    ]

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -38,9 +40,13 @@ jobs:
    steps:
      - name: Clone
        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          lfs: false
+
+      # Disabled due to size (400MB) and always 0 cache hits
+      # - name: ccache
+      #   uses: ggml-org/ccache-action@v1.2.16
+      #   with:
+      #     key: android-build
+      #     evict-old-files: 1d

      - name: Set up JDK
        uses: actions/setup-java@v5
@@ -49,7 +55,7 @@ jobs:
          distribution: zulu

      - name: Setup Android SDK
-        uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
+        uses: android-actions/setup-android@v3
        with:
          log-accepted-android-sdk-licenses: false

@@ -60,84 +66,75 @@ jobs:

  android-ndk:
    runs-on: ubuntu-latest
-    container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
-    defaults:
-      run:
-        shell: bash
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          lfs: false
-
-      - name: Dependencies
-        run: |
-          apt-get update
-          apt-get install -y build-essential
-
-      - name: Build
-        id: ndk_build
-        run: |
-          cmake -D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF -B build
-          cmake --build build
-          cmake --install build --prefix pkg-adb/llama.cpp
-
-      - name: Upload Android Build Artifact
-        if: ${{ always() && steps.ndk_build.outcome == 'success' }}
-        uses: actions/upload-artifact@v6
-        with:
-          name: llama-cpp-android-arm64-cpu
-          path: pkg-adb/llama.cpp
-
-  android-arm64:
-    runs-on: ubuntu-latest

    env:
-      NDK_VERSION: "29.0.14206865"
+      OPENCL_VERSION: 2025.07.22
+
+    strategy:
+      matrix:
+        include:
+          - build: 'arm64-cpu'
+            defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
+          - build: 'arm64-snapdragon'
+            defines: '--preset arm64-android-snapdragon-release'

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: android-arm64
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Set up JDK
-        uses: actions/setup-java@v5
-        with:
-          java-version: 17
-          distribution: temurin
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
-        with:
-          log-accepted-android-sdk-licenses: false
-
-      - name: Install NDK
+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        if: ${{ matrix.build == 'arm64-snapdragon' }}
        run: |
-          sdkmanager "ndk;${{ env.NDK_VERSION }}"
-          echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
+          mkdir opencl
+          curl -L -o opencl/clhpp.tar.gz      https://github.com/KhronosGroup/OpenCL-CLHPP/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
+          curl -L -o opencl/headers.tar.gz    https://github.com/KhronosGroup/OpenCL-Headers/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
+          curl -L -o opencl/icd-loader.tar.gz https://github.com/KhronosGroup/OpenCL-ICD-Loader/archive/refs/tags/v${OPENCL_VERSION}.tar.gz
+          tar -xaf opencl/headers.tar.gz    -C opencl
+          tar -xaf opencl/clhpp.tar.gz      -C opencl
+          tar -xaf opencl/icd-loader.tar.gz -C opencl
+          sudo cp -r opencl/OpenCL-Headers-${OPENCL_VERSION}/CL         ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+          sudo cp -r opencl/OpenCL-CLHPP-${OPENCL_VERSION}/include/CL/* ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include/CL
+          cd opencl/OpenCL-ICD-Loader-${OPENCL_VERSION}
+          cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -DOPENCL_ICD_LOADER_HEADERS_DIR=${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=31 -DANDROID_STL=c++_shared
+          cmake --build build
+          sudo cp build/libOpenCL.so ${ANDROID_NDK_ROOT}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+          rm -rf opencl
+
+      - name: Install Hexagon SDK
+        id: install_hexsdk
+        if: ${{ matrix.build == 'arm64-snapdragon' }}
+        env:
+          HEXSDK_VER: 6.4.0.2
+          HEXTLS_VER: 19.0.04
+        run: |
+          curl -L -o hex-sdk.tar.gz https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v$HEXSDK_VER/hexagon-sdk-v$HEXSDK_VER-amd64-lnx.tar.xz
+          mkdir hex-sdk
+          tar -xaf hex-sdk.tar.gz -C hex-sdk
+          ls -l hex-sdk
+          sudo mv hex-sdk /opt/hexagon
+          echo "HEXAGON_SDK_ROOT=/opt/hexagon/$HEXSDK_VER"                                     >> "$GITHUB_ENV"
+          echo "HEXAGON_TOOLS_ROOT=/opt/hexagon/$HEXSDK_VER/tools/HEXAGON_Tools/$HEXTLS_VER"   >> "$GITHUB_ENV"
+          echo "DEFAULT_HLOS_ARCH=64"                                                          >> "$GITHUB_ENV"
+          echo "DEFAULT_TOOLS_VARIANT=toolv19"                                                 >> "$GITHUB_ENV"
+          echo "DEFAULT_NO_QURT_INC=0"                                                         >> "$GITHUB_ENV"
+          echo "DEFAULT_DSP_ARCH=v73"                                                          >> "$GITHUB_ENV"
+
+      - name: Update CMake presets
+        id: update_presets
+        if: ${{ matrix.build == 'arm64-snapdragon' }}
+        run: |
+          cp docs/backend/snapdragon/CMakeUserPresets.json .

      - name: Build
-        id: cmake_build
+        id: ndk_build
        run: |
-          cmake -B build \
-            -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
-            -DANDROID_ABI=arm64-v8a \
-            -DANDROID_PLATFORM=android-28 \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_BACKEND_DL=ON \
-            -DGGML_NATIVE=OFF \
-            -DGGML_CPU_ALL_VARIANTS=ON \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_BORINGSSL=ON \
-            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(nproc)
+          cmake ${{ matrix.defines }} -B build
+          cmake --build build
+          cmake --install build --prefix pkg-adb/llama.cpp
+
+      - name: Test
+        id: cmake_test
+        run: |
+          echo "FIXME: test on devices"
--- a/.github/workflows/build-apple.yml
+++ b/.github/workflows/build-apple.yml
@@ -46,7 +46,7 @@ jobs:
        uses: actions/checkout@v6

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: macOS-latest-ios
          evict-old-files: 1d
@@ -59,7 +59,6 @@ jobs:
          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DLLAMA_BUILD_APP=OFF \
            -DLLAMA_BUILD_COMMON=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
@@ -90,7 +89,6 @@ jobs:
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_OPENSSL=OFF \
-            -DLLAMA_BUILD_APP=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -126,7 +124,7 @@ jobs:
        uses: actions/checkout@v6

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: macOS-latest-tvos
          evict-old-files: 1d
@@ -140,7 +138,6 @@ jobs:
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_COMMON=OFF \
-            -DLLAMA_BUILD_APP=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -166,7 +163,6 @@ jobs:
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_COMMON=OFF \
-            -DLLAMA_BUILD_APP=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -190,7 +186,7 @@ jobs:
        uses: actions/checkout@v6

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: macOS-latest-swift
          evict-old-files: 1d
@@ -210,7 +206,6 @@ jobs:
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_OPENSSL=OFF \
-            -DLLAMA_BUILD_APP=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
--- a/.github/workflows/build-cann.yml
+++ b/.github/workflows/build-cann.yml
@@ -63,7 +63,7 @@ jobs:
      - name: Set container image
        id: cann-image
        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
+          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
          echo "image=${image}" >> "${GITHUB_OUTPUT}"

      - name: Pull container image
--- a/.github/workflows/build-cmake-pkg.yml
+++ b/.github/workflows/build-cmake-pkg.yml
@@ -5,23 +5,23 @@ on:

 jobs:
  linux:
-    runs-on: [self-hosted, Linux, CPU]
+    runs-on: ubuntu-slim
    steps:
      - uses: actions/checkout@v6
        with:
          fetch-depth: 0

+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y build-essential tcl cmake
+
      - name: Build
        run: |
          PREFIX="$(pwd)"/inst
-          cmake -S . -B build \
-                -DCMAKE_PREFIX_PATH="$PREFIX" \
-                -DLLAMA_OPENSSL=OFF \
-                -DLLAMA_BUILD_TESTS=OFF \
-                -DLLAMA_BUILD_TOOLS=OFF \
-                -DLLAMA_BUILD_EXAMPLES=OFF \
-                -DLLAMA_BUILD_APP=OFF \
-                -DCMAKE_BUILD_TYPE=Release
+          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
+                -DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
+                -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
          cmake --build build --config Release
          cmake --install build --prefix "$PREFIX" --config Release

--- a/.github/workflows/build-cross.yml
+++ b/.github/workflows/build-cross.yml
@@ -246,7 +246,6 @@ jobs:
          apt-get install -y --no-install-recommends \
                  build-essential \
                  glslc \
-                  spirv-headers \
                  gcc-14-loongarch64-linux-gnu \
                  g++-14-loongarch64-linux-gnu \
                  libvulkan-dev:loong64
@@ -277,7 +276,7 @@ jobs:

    env:
      # Make sure this is in sync with build-cache.yml
-      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.2.4"
+      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"

    steps:
      - uses: actions/checkout@v6
@@ -301,17 +300,16 @@ jobs:
          export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
          cmake -B build -DLLAMA_OPENSSL=OFF \
                         -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_OPENMP=OFF \
                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DGGML_CPU_REPACK=OFF \
                         -DLLAMA_BUILD_TOOLS=ON \
                         -DLLAMA_BUILD_TESTS=OFF \
                         -DGGML_CPU_RISCV64_SPACEMIT=ON \
                         -DGGML_RVV=ON \
-                         -DGGML_RV_ZVFH=ON \
                         -DGGML_RV_ZFH=ON \
                         -DGGML_RV_ZICBOP=ON \
                         -DGGML_RV_ZIHINTPAUSE=ON \
-                         -DGGML_RV_ZBA=ON \
+                         -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
                         -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake

          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build-hip.yml
+++ b/.github/workflows/build-hip.yml
@@ -1,167 +0,0 @@
-name: CI (hip)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-hip.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.cuh'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-hip.yml',
-      'ggml/src/ggml-cuda/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-
-  ubuntu-22-hip:
-    runs-on: ubuntu-22.04
-    container: rocm/dev-ubuntu-22.04:6.1.2
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-22-hip
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build with native CMake HIP support
-        id: cmake_build
-        run: |
-          cmake -B build -S . \
-            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-            -DGGML_HIP_ROCWMMA_FATTN=ON \
-            -DGPU_TARGETS="gfx1030" \
-            -DGGML_HIP=ON
-          cmake --build build --config Release -j $(nproc)
-
-  windows-latest-hip:
-    runs-on: windows-2022
-
-    env:
-      # Make sure this is in sync with build-cache.yml
-      HIPSDK_INSTALLER_VERSION: "26.Q1"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Grab rocWMMA package
-        id: grab_rocwmma
-        run: |
-          curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
-          7z x rocwmma.deb
-          7z x data.tar
-
-      - name: Use ROCm Installation Cache
-        uses: actions/cache@v5
-        id: cache-rocm
-        with:
-          path: C:\Program Files\AMD\ROCm
-          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Setup ROCm
-        if: steps.cache-rocm.outputs.cache-hit != 'true'
-        uses: ./.github/actions/windows-setup-rocm
-        with:
-          version: ${{ env.HIPSDK_INSTALLER_VERSION }}
-
-      - name: Verify ROCm
-        id: verify
-        run: |
-          # Find and test ROCm installation
-          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
-          if (-not $clangPath) {
-            Write-Error "ROCm installation not found"
-            exit 1
-          }
-          & $clangPath.FullName --version
-
-      - name: Install ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ${{ github.job }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
-          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . `
-            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
-            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" `
-            -DCMAKE_BUILD_TYPE=Release `
-            -DLLAMA_BUILD_BORINGSSL=ON `
-            -DROCM_DIR="${env:HIP_PATH}" `
-            -DGGML_HIP=ON `
-            -DGGML_HIP_ROCWMMA_FATTN=ON `
-            -DGPU_TARGETS="gfx1100"  `
-            -DGGML_RPC=ON
-          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
-
-  ubuntu-22-musa:
-    runs-on: ubuntu-22.04
-    container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          apt-get update
-          apt-get install -y build-essential git cmake libssl-dev
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-22-musa
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build with native CMake MUSA support
-        id: cmake_build
-        run: |
-          cmake -B build -S . \
-            -DGGML_MUSA=ON
-          time cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build-ibm.yml
+++ b/.github/workflows/build-ibm.yml
@@ -1,150 +0,0 @@
-name: CI (ibm)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-ibm.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-ibm.yml',
-      'ggml/src/ggml-cpu/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-
-  ubuntu-24-s390x:
-    runs-on: ubuntu-24.04-s390x
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Build Dependencies
-        id: build_depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends \
-            python3 python3-pip python3-dev python3-wheel \
-            libjpeg-dev build-essential libssl-dev \
-            git-lfs
-
-      - name: Toolchain workaround (GCC 14)
-        run: |
-          sudo apt-get install -y gcc-14 g++-14
-          echo "CC=gcc-14" >> "$GITHUB_ENV"
-          echo "CXX=g++-14" >> "$GITHUB_ENV"
-
-      - name: Python Dependencies
-        id: python_depends
-        run: |
-          export PIP_BREAK_SYSTEM_PACKAGES="1"
-          python3 -m pip install --upgrade pip setuptools
-          pip3 install ./gguf-py
-
-      - name: Swap Endianness
-        id: endianness
-        run: |
-          for f in models/*.gguf; do
-            echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big
-          done
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-      - name: Test llama2c (s390x)
-        id: llama2c_test_s390x
-        run: |
-          cd build
-          echo "Fetch llama2c big-endian model"
-          wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
-          ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
-
-  ubuntu-24-ppc64le:
-    runs-on: ubuntu-24.04-ppc64le
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Build Dependencies
-        id: build_depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends \
-            python3 python3-pip python3-dev python3-wheel \
-            libjpeg-dev build-essential libssl-dev \
-            git-lfs
-
-      - name: Toolchain workaround (GCC 14)
-        run: |
-          sudo apt-get install -y gcc-14 g++-14
-          echo "CC=gcc-14" >> "$GITHUB_ENV"
-          echo "CXX=g++-14" >> "$GITHUB_ENV"
-
-      - name: Python Dependencies
-        id: python_depends
-        run: |
-          export PIP_BREAK_SYSTEM_PACKAGES="1"
-          python3 -m pip install --upgrade pip setuptools
-          pip3 install ./gguf-py
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-      - name: Test llama2c conversion
-        id: llama2c_test
-        run: |
-          cd build
-          echo "Fetch tokenizer"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
-          echo "Fetch llama2c model"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
--- a/.github/workflows/build-msys.yml
+++ b/.github/workflows/build-msys.yml
@@ -43,7 +43,7 @@ jobs:
      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Setup ${{ matrix.sys }}
-        uses: msys2/setup-msys2@cafece8e6baf9247cf9b1bf95097b0b983cc558d # v2
+        uses: msys2/setup-msys2@v2
        with:
          update: true
          msystem: ${{matrix.sys}}
--- a/.github/workflows/build-opencl.yml
+++ b/.github/workflows/build-opencl.yml
@@ -1,83 +0,0 @@
-name: CI (opencl)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-opencl.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cl'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-opencl.yml',
-      'ggml/src/ggml-opencl/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-
-  windows-latest-opencl-adreno:
-    runs-on: windows-2025
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: windows-latest-llvm-arm64-opencl-adreno
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
-      - name: Install OpenCL Headers and Libs
-        id: install_opencl
-        run: |
-          git clone https://github.com/KhronosGroup/OpenCL-Headers
-          cd OpenCL-Headers
-          cmake -B build `
-            -DBUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
-            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build --target install
-          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
-          cd OpenCL-ICD-Loader
-          cmake -B build-arm64-release `
-            -A arm64 `
-            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
-          cmake --build build-arm64-release --target install --config release
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -S . -B build -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON -DLLAMA_BUILD_BORINGSSL=ON
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
--- a/.github/workflows/build-openvino.yml
+++ b/.github/workflows/build-openvino.yml
@@ -1,120 +0,0 @@
-name: CI (openvino)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-openvino.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-openvino.yml',
-      'ggml/src/ggml-openvino/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-24-openvino:
-    name: ubuntu-24-openvino-${{ matrix.openvino_device }}
-
-    concurrency:
-      group: openvino-${{ matrix.variant }}-${{ github.head_ref || github.ref }}
-      cancel-in-progress: false
-
-    strategy:
-      matrix:
-        include:
-          - variant: cpu
-            runner: '"ubuntu-24.04"'
-            openvino_device: "CPU"
-          - variant: gpu
-            runner: '["self-hosted","Linux","Intel","OpenVINO"]'
-            openvino_device: "GPU"
-
-    runs-on: ${{ fromJSON(matrix.runner) }}
-
-    env:
-      # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
-      OPENVINO_VERSION_MAJOR: "2026.0"
-      OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        if: runner.environment == 'github-hosted'
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
-          sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
-
-      - name: Use OpenVINO Toolkit Cache
-        if: runner.environment == 'github-hosted'
-        uses: actions/cache@v5
-        id: cache-openvino
-        with:
-          path: ./openvino_toolkit
-          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
-
-      - name: Setup OpenVINO Toolkit
-        if: steps.cache-openvino.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-openvino
-        with:
-          path: ./openvino_toolkit
-          version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
-          version_full: ${{ env.OPENVINO_VERSION_FULL }}
-
-      - name: Install OpenVINO dependencies
-        run: |
-          cd ./openvino_toolkit
-          chmod +x ./install_dependencies/install_openvino_dependencies.sh
-          echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source ./openvino_toolkit/setupvars.sh
-          cmake -B build/ReleaseOV -G Ninja \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENVINO=ON
-          time cmake --build build/ReleaseOV --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        # TODO: fix and re-enable the `test-llama-archs` test below
-        run: |
-          cd ${{ github.workspace }}
-          if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
-            export GGML_OPENVINO_DEVICE=GPU
-          fi
-          ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000
--- a/.github/workflows/build-riscv.yml
+++ b/.github/workflows/build-riscv.yml
@@ -34,78 +34,8 @@ env:
  LLAMA_LOG_TIMESTAMPS: 1

 jobs:
-  ubuntu-cpu-riscv64-native:
-    runs-on: ubuntu-24.04-riscv
-
-    steps:
-      - name: Install dependencies
-        run: |
-          # Install necessary packages
-          sudo apt-get update
-          sudo apt-get install -y libssl-dev
-
-          # Set gcc-14 and g++-14 as the default compilers
-          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
-          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
-
-          git lfs install
-
-      - name: Check environment
-        run: |
-          uname -a
-          gcc --version
-          g++ --version
-          ldd --version
-          cmake --version
-          rustc --version
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
-        with:
-          key: ubuntu-cpu-riscv64-native
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_EXAMPLES=ON \
-            -DLLAMA_BUILD_TOOLS=ON \
-            -DLLAMA_BUILD_TESTS=ON \
-            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DGGML_RPC=ON \
-            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
-
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-      - name: Test llama2c conversion
-        id: llama2c_test
-        run: |
-          cd build
-          echo "Fetch tokenizer"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
-          echo "Fetch llama2c model"
-          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
-          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
-          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
-
  ubuntu-riscv64-native-sanitizer:
-    runs-on: ubuntu-24.04-riscv
+    runs-on: RISCV64

    continue-on-error: true

@@ -117,9 +47,20 @@ jobs:
    steps:
      - name: Install dependencies
        run: |
+          sudo apt-get update
+
+          # Install necessary packages
+          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential wget ccache git-lfs
+
          # Set gcc-14 and g++-14 as the default compilers
          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+          # Install Rust stable version
+          rustup install stable
+          rustup default stable

          git lfs install

@@ -132,12 +73,23 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

-      - name: ccache
-        uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
-        with:
-          key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+      - name: Setup ccache
+        run: |
+          # Unique cache directory per matrix combination
+          export CCACHE_DIR="$HOME/.ccache/sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}"
+          mkdir -p "$CCACHE_DIR"
+
+          # Configure ccache
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+          ccache --set-config=compression_level=6
+          ccache --set-config=cache_dir="$CCACHE_DIR"
+          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+          ccache --set-config=hash_dir=false
+
+          # Export for subsequent steps
+          echo "CCACHE_DIR=$CCACHE_DIR" >> $GITHUB_ENV
+          echo "PATH=/usr/lib/ccache:$PATH" >> $GITHUB_ENV

      - name: Build
        id: cmake_build
--- a/.github/workflows/build-rpc.yml
+++ b/.github/workflows/build-rpc.yml
@@ -1,67 +0,0 @@
-name: CI (rpc)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-rpc.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-rpc.yml',
-      'ggml/src/ggml-rpc/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-
-  ubuntu-latest-rpc:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev ninja-build
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose
--- a/.github/workflows/build-sanitize.yml
+++ b/.github/workflows/build-sanitize.yml
@@ -43,7 +43,7 @@ jobs:
        uses: actions/checkout@v6

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
          evict-old-files: 1d
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -67,7 +67,7 @@ jobs:
        id: ggml-ci
        run: |
          nvidia-smi
-          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-nvidia-vulkan-cm:
    runs-on: [self-hosted, Linux, NVIDIA]
@@ -81,7 +81,7 @@ jobs:
        id: ggml-ci
        run: |
          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-nvidia-vulkan-cm2:
    runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]
@@ -95,36 +95,7 @@ jobs:
        id: ggml-ci
        run: |
          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  ggml-ci-nvidia-webgpu:
-    runs-on: [self-hosted, Linux, NVIDIA, X64]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dawn Dependency
-        id: dawn-depends
-        run: |
-          DAWN_VERSION="v20260317.182325"
-          DAWN_OWNER="google"
-          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
-          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          curl -L -o artifact.tar.gz \
-            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          GG_BUILD_WEBGPU=1 \
-          GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
-          GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
-            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  # TODO: provision AMX-compatible machine
  #ggml-ci-cpu-amx:
@@ -138,7 +109,7 @@ jobs:
  #    - name: Test
  #      id: ggml-ci
  #      run: |
-  #        bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+  #        bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  # TODO: provision AMD GPU machine
  # ggml-ci-amd-vulkan:
@@ -153,7 +124,7 @@ jobs:
  #       id: ggml-ci
  #       run: |
  #         vulkaninfo --summary
-  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  # TODO: provision AMD GPU machine
  # ggml-ci-amd-rocm:
@@ -168,7 +139,7 @@ jobs:
  #       id: ggml-ci
  #       run: |
  #         amd-smi static
-  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp

  ggml-ci-mac-metal:
    runs-on: [self-hosted, macOS, ARM64]
@@ -194,15 +165,16 @@ jobs:
      - name: Dawn Dependency
        id: dawn-depends
        run: |
-          DAWN_VERSION="v20260317.182325"
-          DAWN_OWNER="google"
+          DAWN_VERSION="v2.0.0"
+          DAWN_OWNER="reeselevine"
          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
-          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          curl -L -o artifact.tar.gz \
-            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
+          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+          curl -L -o artifact.zip \
+            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
+          unzip artifact.zip
+          tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1

      - name: Test
        id: ggml-ci
@@ -240,34 +212,9 @@ jobs:
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-win-intel-vulkan:
-    runs-on: [self-hosted, Windows, X64, Intel]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        shell: C:\msys64\usr\bin\bash.exe --noprofile --norc -eo pipefail "{0}"
-        env:
-          MSYSTEM: UCRT64
-          CHERE_INVOKING: 1
-          PATH: C:\msys64\ucrt64\bin;C:\msys64\usr\bin;C:\Windows\System32;${{ env.PATH }}
-        run: |
-          vulkaninfo --summary
-          # Skip python related tests with GG_BUILD_LOW_PERF=1 since Windows MSYS2 UCRT64 currently fails to create
-          # a valid python environment for testing
-          LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp
-
  ggml-ci-intel-openvino-gpu-low-perf:
    runs-on: [self-hosted, Linux, Intel, OpenVINO]

-    concurrency:
-      group: openvino-gpu-${{ github.head_ref || github.ref }}
-      cancel-in-progress: false
-
    env:
      # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
      OPENVINO_VERSION_MAJOR: "2026.0"
@@ -295,62 +242,4 @@ jobs:
        id: ggml-ci
        run: |
          source ./openvino_toolkit/setupvars.sh
-          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  ggml-ci-arm64-cpu-low-perf:
-    runs-on: [self-hosted, Linux, ARM64, CPU]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  ggml-ci-arm64-cpu-high-perf:
-    runs-on: [self-hosted, Linux, ARM64, CPU]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-# TODO: not sure how to detect ARM flags on DGX Spark. currently get this error during cmake:
-#         CMake Warning at ggml/src/ggml-cpu/CMakeLists.txt:147 (message):
-#           ARM -march/-mcpu not found, -mcpu=native will be used
-#
-#       if we resolve this, we should be able to offload these jobs to the self-hosted runners
-#
-#  ggml-ci-arm64-cpu-high-perf-sve:
-#    runs-on: [self-hosted, Linux, ARM64, CPU]
-#
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v6
-#
-#      - name: Test
-#        id: ggml-ci
-#        run: |
-#          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-#
-#  ggml-ci-arm64-cpu-kleidiai:
-#    runs-on: [self-hosted, Linux, ARM64, CPU]
-#
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v6
-#
-#      - name: Test
-#        id: ggml-ci
-#        run: |
-#          GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
--- a/.github/workflows/build-sycl.yml
+++ b/.github/workflows/build-sycl.yml
@@ -1,158 +0,0 @@
-name: CI (sycl)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-sycl.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-sycl.yml',
-      'ggml/src/ggml-sycl/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-
-  ubuntu-24-sycl:
-    strategy:
-      matrix:
-        build: [fp32]
-        include:
-          - build: fp32
-            fp16: OFF
-
-    runs-on: ubuntu-24.04
-
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-      LEVEL_ZERO_VERSION: "1.28.2"
-      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: Install Level Zero SDK
-        shell: bash
-        run: |
-          cd /tmp
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-sycl-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-  windows-latest-sycl:
-    runs-on: windows-2022
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      - name: Install Level Zero SDK
-        shell: pwsh
-        run: |
-          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: windows-latest-sycl
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
-      - name: Build
-        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat
--- a/.github/workflows/build-virtgpu.yml
+++ b/.github/workflows/build-virtgpu.yml
@@ -1,50 +0,0 @@
-name: CI (virtgpu)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-virtgpu.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-virtgpu.yml',
-      'ggml/src/ggml-virtgpu/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  ubuntu-24-virtgpu:
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential libdrm-dev pkg-config libssl-dev
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DGGML_VIRTGPU=ON \
-            -DGGML_VIRTGPU_BACKEND=ON
-          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build-vulkan.yml
+++ b/.github/workflows/build-vulkan.yml
@@ -45,7 +45,7 @@ jobs:
        uses: actions/checkout@v6

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ubuntu-24-vulkan-llvmpipe
          evict-old-files: 1d
@@ -72,7 +72,7 @@ jobs:

      - name: Setup Vulkan SDK
        if: steps.cache-sdk.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-vulkan
+        uses: ./.github/actions/linux-setup-vulkan-llvmpipe
        with:
          path: ./vulkan_sdk
          version: ${{ env.VULKAN_SDK_VERSION }}
@@ -93,5 +93,4 @@ jobs:
          export GGML_VK_DISABLE_F16=1
          export GGML_VK_DISABLE_COOPMAT=1
          # This is using llvmpipe and runs slower than other backends
-          # test-backend-ops is too slow on llvmpipe, skip it
-          ctest -L main -E test-backend-ops --verbose --timeout 900
+          ctest -L main --verbose --timeout 4800
--- a/.github/workflows/build-webgpu.yml
+++ b/.github/workflows/build-webgpu.yml
@@ -1,186 +0,0 @@
-name: CI (webgpu)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-webgpu.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.wgsl'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/build-webgpu.yml',
-      'ggml/src/ggml-webgpu/**'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-
-  macOS-latest-arm64-webgpu:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: macOS-latest-arm64-webgpu
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dawn Dependency
-        id: dawn-depends
-        run: |
-          DAWN_VERSION="v20260317.182325"
-          DAWN_OWNER="google"
-          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
-          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          curl -L -o artifact.tar.gz \
-            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
-
-      - name: Build
-        id: cmake_build
-        run: |
-          export CMAKE_PREFIX_PATH=dawn
-          cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
-          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  ubuntu-24-webgpu:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-webgpu
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo add-apt-repository -y ppa:kisak/kisak-mesa
-          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers \
-            libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
-
-      - name: Get latest Vulkan SDK version
-        id: vulkan_sdk_version
-        run: |
-          echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
-
-      - name: Use Vulkan SDK Cache
-        uses: actions/cache@v5
-        id: cache-sdk
-        with:
-          path: ./vulkan_sdk
-          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
-
-      - name: Setup Vulkan SDK
-        if: steps.cache-sdk.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-vulkan
-        with:
-          path: ./vulkan_sdk
-          version: ${{ env.VULKAN_SDK_VERSION }}
-
-      - name: Dawn Dependency
-        id: dawn-depends
-        run: |
-          sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
-          DAWN_VERSION="v20260317.182325"
-          DAWN_OWNER="google"
-          DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
-          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          curl -L -o artifact.tar.gz \
-            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-          mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
-
-      - name: Build
-        id: cmake_build
-        run: |
-          export Dawn_DIR=dawn/lib64/cmake/Dawn
-          cmake -B build \
-            -DGGML_WEBGPU=ON
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          # This is using llvmpipe and runs slower than other backends
-          # test-backend-ops is too slow on llvmpipe, skip it
-          ctest -L main -E test-backend-ops --verbose --timeout 900
-
-  ubuntu-24-webgpu-wasm:
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Install Emscripten
-        run: |
-          git clone https://github.com/emscripten-core/emsdk.git
-          cd emsdk
-          ./emsdk install latest
-          ./emsdk activate latest
-
-      - name: Fetch emdawnwebgpu
-        run: |
-          DAWN_TAG="v20260317.182325"
-          EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
-          echo "Downloading ${EMDAWN_PKG}"
-          curl -L -o emdawn.zip \
-            "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
-          unzip emdawn.zip
-
-      - name: Build WASM WebGPU
-        run: |
-          source emsdk/emsdk_env.sh
-          emcmake cmake -B build-wasm \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_WEBGPU=ON \
-            -DLLAMA_OPENSSL=OFF \
-            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
-
-          time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -69,7 +69,7 @@ jobs:
        uses: actions/checkout@v6

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: macOS-latest-arm64
          evict-old-files: 1d
@@ -87,7 +87,7 @@ jobs:
            -DGGML_METAL_EMBED_LIBRARY=OFF \
            -DGGML_METAL_SHADER_DEBUG=ON \
            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
          leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1

      - name: Test
@@ -105,7 +105,7 @@ jobs:
        uses: actions/checkout@v6

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: macOS-latest-x64
          evict-old-files: 1d
@@ -124,7 +124,49 @@ jobs:
            -DGGML_METAL=OFF \
            -DGGML_RPC=ON \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
-          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+  macOS-latest-arm64-webgpu:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-arm64-webgpu
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          DAWN_VERSION="v2.0.0"
+          DAWN_OWNER="reeselevine"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
+          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+          curl -L -o artifact.zip \
+            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+          mkdir dawn
+          unzip artifact.zip
+          tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
+
+      - name: Build
+        id: cmake_build
+        run: |
+          export CMAKE_PREFIX_PATH=dawn
+          cmake -B build -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: cmake_test
@@ -139,7 +181,11 @@ jobs:
          - build: 'x64'
            os: ubuntu-22.04
          - build: 'arm64'
-            os: ubuntu-24.04-arm
+            os: ubuntu-22.04-arm
+          - build: 's390x'
+            os: ubuntu-24.04-s390x
+          - build: 'ppc64le'
+            os: ubuntu-24.04-ppc64le

    runs-on: ${{ matrix.os }}

@@ -149,7 +195,7 @@ jobs:
        uses: actions/checkout@v6

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ubuntu-cpu-${{ matrix.build }}
          evict-old-files: 1d
@@ -160,31 +206,31 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
-            python3 python3-pip python3-dev python3-wheel \
+            python3 python3-pip python3-dev \
            libjpeg-dev build-essential libssl-dev \
            git-lfs

-      - name: Toolchain workaround (GCC 14)
-        if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
-        run: |
-          sudo apt-get install -y gcc-14 g++-14
-          echo "CC=gcc-14" >> "$GITHUB_ENV"
-          echo "CXX=g++-14" >> "$GITHUB_ENV"
-
      - name: Python Dependencies
        id: python_depends
        run: |
-          export PIP_BREAK_SYSTEM_PACKAGES="1"
-          python3 -m pip install --upgrade pip setuptools
+          python3 -m pip install --upgrade pip
          pip3 install ./gguf-py

+      - name: Swap Endianness
+        id: endianness
+        if: ${{ matrix.build == 's390x' }}
+        run: |
+          for f in models/*.gguf; do  # "$f" is quoted so each path stays a single argument
+            echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py "$f" big
+          done
+
      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_RPC=ON
-          time cmake --build build --config Release -j $(nproc)
+          cmake --build build --config Release -j $(nproc)

      - name: Test
        id: cmake_test
@@ -194,6 +240,7 @@ jobs:

      - name: Test llama2c conversion
        id: llama2c_test
+        if: ${{ matrix.build != 's390x' }}
        run: |
          cd build
          echo "Fetch tokenizer"
@@ -203,16 +250,19 @@ jobs:
          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

-  ubuntu-24-vulkan:
-    strategy:
-      matrix:
-        include:
-          - build: 'x64'
-            os: ubuntu-24.04
-          - build: 'arm64'
-            os: ubuntu-24.04-arm
+      - name: Test llama2c (s390x)
+        id: llama2c_test_s390x
+        if: ${{ matrix.build == 's390x' }}
+        run: |
+          cd build
+          echo "Fetch llama2c big-endian model"
+          wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
+          ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

-    runs-on: ${{ matrix.os }}
+  ubuntu-latest-rpc:
+    runs-on: ubuntu-latest
+
+    continue-on-error: true

    steps:
      - name: Clone
@@ -223,15 +273,38 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
-          echo "CC=gcc-14" >> "$GITHUB_ENV"
-          echo "CXX=g++-14" >> "$GITHUB_ENV"
+          sudo apt-get install -y build-essential libssl-dev
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DGGML_RPC=ON
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose
+
+  ubuntu-24-vulkan:
+    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update && sudo apt-get install -y glslc libvulkan-dev libssl-dev  # refresh package lists before installing

      - name: Configure
        id: cmake_configure
        run: |
          cmake -B build \
-            -G "Ninja" \
            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -DGGML_BACKEND_DL=ON \
            -DGGML_CPU_ALL_VARIANTS=ON \
@@ -240,7 +313,351 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          time cmake --build build -j $(nproc)
+          cmake --build build -j $(nproc)
+
+  ubuntu-24-webgpu:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-24-webgpu
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo add-apt-repository -y ppa:kisak/kisak-mesa
+          sudo apt-get update -y
+          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
+
+      - name: Get latest Vulkan SDK version
+        id: vulkan_sdk_version
+        run: |
+          echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
+
+      - name: Use Vulkan SDK Cache
+        uses: actions/cache@v5
+        id: cache-sdk
+        with:
+          path: ./vulkan_sdk
+          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
+
+      - name: Setup Vulkan SDK
+        if: steps.cache-sdk.outputs.cache-hit != 'true'
+        uses: ./.github/actions/linux-setup-vulkan
+        with:
+          path: ./vulkan_sdk
+          version: ${{ env.VULKAN_SDK_VERSION }}
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
+          DAWN_VERSION="v2.0.0"
+          DAWN_OWNER="reeselevine"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release"
+          echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+          curl -L -o artifact.zip \
+            "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
+          mkdir dawn
+          unzip artifact.zip
+          tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
+
+      - name: Build
+        id: cmake_build
+        run: |
+          export Dawn_DIR=dawn/lib64/cmake/Dawn
+          cmake -B build \
+            -DGGML_WEBGPU=ON
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          # This is using llvmpipe and runs slower than other backends
+          ctest -L main --verbose --timeout 900
+
+  ubuntu-24-webgpu-wasm:
+    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Install Emscripten
+        run: |
+          git clone https://github.com/emscripten-core/emsdk.git
+          cd emsdk
+          ./emsdk install latest
+          ./emsdk activate latest
+
+      - name: Fetch emdawnwebgpu
+        run: |
+          DAWN_TAG="v20251027.212519"
+          EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
+          echo "Downloading ${EMDAWN_PKG}"
+          curl -L -o emdawn.zip \
+            "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
+          unzip emdawn.zip
+
+      - name: Build WASM WebGPU
+        run: |
+          source emsdk/emsdk_env.sh
+          emcmake cmake -B build-wasm \
+            -DGGML_WEBGPU=ON \
+            -DLLAMA_OPENSSL=OFF \
+            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
+
+          cmake --build build-wasm --target test-backend-ops -j $(nproc)
+
+  ubuntu-22-hip:
+    runs-on: ubuntu-22.04
+    container: rocm/dev-ubuntu-22.04:6.1.2
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-hip
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build with native CMake HIP support
+        id: cmake_build
+        run: |
+          cmake -B build -S . \
+            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
+            -DGGML_HIP_ROCWMMA_FATTN=ON \
+            -DGGML_HIP=ON
+          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-22-musa:
+    runs-on: ubuntu-22.04
+    container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          apt-get update
+          apt-get install -y build-essential git cmake libssl-dev
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-musa
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build with native CMake MUSA support
+        id: cmake_build
+        run: |
+          cmake -B build -S . \
+            -DGGML_MUSA=ON
+          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-22-sycl:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-sycl
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx
+          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-22-sycl-fp16:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-22-sycl-fp16
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          cmake -B build \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx \
+            -DGGML_SYCL_F16=ON
+          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-openvino:
+      name: ubuntu-24-openvino-${{ matrix.openvino_device }}
+      strategy:
+        matrix:
+          include:
+            - variant: cpu
+              runner: '"ubuntu-24.04"'
+              openvino_device: "CPU"
+            - variant: gpu
+              runner: '["self-hosted","Linux","X64","Intel"]'
+              openvino_device: "GPU"
+
+      runs-on: ${{ fromJSON(matrix.runner) }}
+
+      env:
+        # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile
+        OPENVINO_VERSION_MAJOR: "2026.0"
+        OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886"
+
+      steps:
+        - name: Clone
+          id: checkout
+          uses: actions/checkout@v6
+
+        - name: ccache
+          if: runner.environment == 'github-hosted'
+          uses: ggml-org/ccache-action@v1.2.16
+          with:
+            key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
+            evict-old-files: 1d
+            save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+        - name: Dependencies
+          id: depends
+          run: |
+            sudo apt-get update
+            sudo apt-get install -y build-essential libssl-dev libtbb12 cmake ninja-build python3-pip
+            sudo apt-get install -y ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd
+
+        - name: Use OpenVINO Toolkit Cache
+          if: runner.environment == 'github-hosted'
+          uses: actions/cache@v5
+          id: cache-openvino
+          with:
+            path: ./openvino_toolkit
+            key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+
+        - name: Setup OpenVINO Toolkit
+          if: steps.cache-openvino.outputs.cache-hit != 'true'
+          uses: ./.github/actions/linux-setup-openvino
+          with:
+            path: ./openvino_toolkit
+            version_major: ${{ env.OPENVINO_VERSION_MAJOR }}
+            version_full: ${{ env.OPENVINO_VERSION_FULL }}
+
+        - name: Install OpenVINO dependencies
+          run: |
+            cd ./openvino_toolkit
+            chmod +x ./install_dependencies/install_openvino_dependencies.sh
+            echo "Y" | sudo -E ./install_dependencies/install_openvino_dependencies.sh
+
+        - name: Build
+          id: cmake_build
+          run: |
+            source ./openvino_toolkit/setupvars.sh
+            cmake -B build/ReleaseOV -G Ninja \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DGGML_OPENVINO=ON
+            cmake --build build/ReleaseOV --config Release -j $(nproc)
+
+        - name: Test
+          id: cmake_test
+          # TODO: fix and re-enable the `test-llama-archs` test below
+          run: |
+            cd ${{ github.workspace }}
+            if [ "${{ matrix.openvino_device }}" = "GPU" ]; then
+              export GGML_OPENVINO_DEVICE=GPU
+            fi
+            ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000

  windows-latest:
    runs-on: windows-2025
@@ -265,6 +682,9 @@ jobs:
          - build: 'llvm-arm64'
            arch: 'arm64'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
+          - build: 'llvm-arm64-opencl-adreno'
+            arch: 'arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

    steps:
      - name: Clone
@@ -272,7 +692,7 @@ jobs:
        uses: actions/checkout@v6

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: windows-latest-${{ matrix.build }}
          variant: ccache
@@ -306,6 +726,26 @@ jobs:
        run: |
          choco install ninja

+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
+        run: |
+          git clone https://github.com/KhronosGroup/OpenCL-Headers
+          cd OpenCL-Headers
+          cmake -B build `
+            -DBUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build --target install
+          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+          cd OpenCL-ICD-Loader
+          cmake -B build-arm64-release `
+            -A arm64 `
+            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build-arm64-release --target install --config release
+
      - name: Build
        id: cmake_build
        run: |
@@ -358,7 +798,7 @@ jobs:
              apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev

        - name: ccache
-          uses: ggml-org/ccache-action@v1.2.21
+          uses: ggml-org/ccache-action@v1.2.16
          with:
            key: ubuntu-latest-cuda
            evict-old-files: 1d
@@ -390,7 +830,7 @@ jobs:
        uses: actions/checkout@v6

      - name: Install ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: windows-cuda-${{ matrix.cuda }}
          variant: ccache
@@ -426,6 +866,197 @@ jobs:
          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
          cmake --build build --config Release

+  windows-latest-sycl:
+    runs-on: windows-2022
+
+    defaults:
+      run:
+        shell: bash
+
+    env:
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: windows-latest-sycl
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Install
+        run:  |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
+      # TODO: add SSL support; we will also need to modify win-build-sycl.bat to accept user-specified args
+
+      - name: Build
+        id: cmake_build
+        run:  examples/sycl/win-build-sycl.bat
+
+  windows-latest-hip:
+    runs-on: windows-2022
+
+    env:
+      # Make sure this is in sync with build-cache.yml
+      HIPSDK_INSTALLER_VERSION: "26.Q1"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Grab rocWMMA package
+        id: grab_rocwmma
+        run: |
+          curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
+          7z x rocwmma.deb
+          7z x data.tar
+
+      - name: Use ROCm Installation Cache
+        uses: actions/cache@v5
+        id: cache-rocm
+        with:
+          path: C:\Program Files\AMD\ROCm
+          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+
+      - name: Setup ROCm
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
+        uses: ./.github/actions/windows-setup-rocm
+        with:
+          version: ${{ env.HIPSDK_INSTALLER_VERSION }}
+
+      - name: Verify ROCm
+        id: verify
+        run: |
+          # Find and test ROCm installation
+          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
+          if (-not $clangPath) {
+            Write-Error "ROCm installation not found"
+            exit 1
+          }
+          & $clangPath.FullName --version
+
+      - name: Install ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ${{ github.job }}
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake -G "Unix Makefiles" -B build -S . `
+            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
+            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/" `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DLLAMA_BUILD_BORINGSSL=ON `
+            -DROCM_DIR="${env:HIP_PATH}" `
+            -DGGML_HIP=ON `
+            -DGGML_HIP_ROCWMMA_FATTN=ON `
+            -DGGML_RPC=ON
+          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
+
+  ubuntu-cpu-riscv64-native:
+    runs-on: RISCV64
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+
+          # Install necessary packages
+          sudo apt-get install -y libatomic1 libtsan2 gcc-14 g++-14 rustup cmake build-essential libssl-dev wget ccache git-lfs
+
+          # Set gcc-14 and g++-14 as the default compilers
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+          sudo ln -sf /usr/bin/gcc-14 /usr/bin/gcc
+          sudo ln -sf /usr/bin/g++-14 /usr/bin/g++
+
+          # Install Rust stable version
+          rustup install stable
+          rustup default stable
+
+          git lfs install
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Check environment
+        run: |
+          uname -a
+          gcc --version
+          g++ --version
+          ldd --version
+          cmake --version
+          rustc --version
+
+      - name: Setup ccache
+        run: |
+          # Set unique cache directory for this job
+          export CCACHE_DIR="$HOME/.ccache/cpu-cmake-rv64-native"
+          mkdir -p "$CCACHE_DIR"
+
+          # Configure ccache for optimal performance
+          ccache --set-config=max_size=5G
+          ccache --set-config=compression=true
+          ccache --set-config=compression_level=6
+          ccache --set-config=cache_dir="$CCACHE_DIR"
+
+          # Enable more aggressive caching
+          ccache --set-config=sloppiness=file_macro,time_macros,include_file_mtime,include_file_ctime
+          ccache --set-config=hash_dir=false
+
+          # Export for subsequent steps (PATH entries are prepended via GITHUB_PATH)
+          echo "CCACHE_DIR=$CCACHE_DIR" >> "$GITHUB_ENV"
+          echo "/usr/lib/ccache" >> "$GITHUB_PATH"
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=ON \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DGGML_RPC=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Test llama2c conversion
+        id: llama2c_test
+        run: |
+          cd build
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
 # TODO: simplify the following workflows using a matrix
 # TODO: run lighter CI on PRs and the full CI only on master (if needed)
  ggml-ci-x64-cpu-low-perf:
@@ -437,7 +1068,7 @@ jobs:
        uses: actions/checkout@v6

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ggml-ci-x64-cpu-low-perf
          evict-old-files: 1d
@@ -454,32 +1085,31 @@ jobs:
        run: |
          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

-# note: moved to build-self-hosted.yml - can remove from here when everything is stable
-#  ggml-ci-arm64-cpu-low-perf:
-#    runs-on: ubuntu-22.04-arm
-#
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v6
-#
-#      - name: ccache
-#        uses: ggml-org/ccache-action@v1.2.21
-#        with:
-#          key: ggml-ci-arm64-cpu-low-perf
-#          evict-old-files: 1d
-#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-#
-#      - name: Dependencies
-#        id: depends
-#        run: |
-#          sudo apt-get update
-#          sudo apt-get install build-essential
-#
-#      - name: Test
-#        id: ggml-ci
-#        run: |
-#          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+  ggml-ci-arm64-cpu-low-perf:
+    runs-on: ubuntu-22.04-arm
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ggml-ci-arm64-cpu-low-perf
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

  ggml-ci-x64-cpu-high-perf:
    runs-on: ubuntu-22.04
@@ -490,7 +1120,7 @@ jobs:
        uses: actions/checkout@v6

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ggml-ci-x64-cpu-high-perf
          evict-old-files: 1d
@@ -507,32 +1137,31 @@ jobs:
        run: |
          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

-# note: moved to build-self-hosted.yml - can remove from here when everything is stable
-#  ggml-ci-arm64-cpu-high-perf:
-#    runs-on: ubuntu-22.04-arm
-#
-#    steps:
-#      - name: Clone
-#        id: checkout
-#        uses: actions/checkout@v6
-#
-#      - name: ccache
-#        uses: ggml-org/ccache-action@v1.2.21
-#        with:
-#          key: ggml-ci-arm64-cpu-high-perf
-#          evict-old-files: 1d
-#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-#
-#      - name: Dependencies
-#        id: depends
-#        run: |
-#          sudo apt-get update
-#          sudo apt-get install build-essential
-#
-#      - name: Test
-#        id: ggml-ci
-#        run: |
-#          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+  ggml-ci-arm64-cpu-high-perf:
+    runs-on: ubuntu-22.04-arm
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ggml-ci-arm64-cpu-high-perf
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

  ggml-ci-arm64-cpu-high-perf-sve:
    runs-on: ubuntu-22.04-arm
@@ -543,7 +1172,7 @@ jobs:
        uses: actions/checkout@v6

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ggml-ci-arm64-cpu-high-perf-sve
          evict-old-files: 1d
@@ -569,7 +1198,7 @@ jobs:
         uses: actions/checkout@v6

       - name: ccache
-         uses: ggml-org/ccache-action@v1.2.21
+         uses: ggml-org/ccache-action@v1.2.16
         with:
           key: ggml-ci-arm64-cpu-kleidiai
           evict-old-files: 1d
@@ -621,7 +1250,7 @@ jobs:
           sudo apt-get install -y cmake

       - name: ccache
-         uses: ggml-org/ccache-action@v1.2.21
+         uses: ggml-org/ccache-action@v1.2.16
         with:
           key: ggml-ci-arm64-cpu-kleidiai-graviton4
           evict-old-files: 1d
--- a/.github/workflows/check-vendor.yml
+++ b/.github/workflows/check-vendor.yml
@@ -19,7 +19,7 @@ on:

 jobs:
  check-vendor:
-    runs-on: [self-hosted, fast]
+    runs-on: ubuntu-slim

    steps:
      - name: Checkout
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -17,7 +17,7 @@ jobs:
    steps:
      - uses: actions/stale@v10
        with:
-          exempt-issue-labels: "refactoring,help wanted,good first issue,research 🔬,bug,roadmap,security"
+          exempt-issue-labels: "refactoring,help wanted,good first issue,research 🔬,bug,roadmap"
          days-before-issue-stale: 30
          days-before-issue-close: 14
          stale-issue-label: "stale"
--- a/.github/workflows/code-style.yml
+++ b/.github/workflows/code-style.yml
@@ -1,51 +0,0 @@
-name: Code Style Checker
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-  pull_request:
-    branches:
-      - master
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  model-naming:
-    runs-on: [self-hosted, fast]
-    steps:
-      - uses: actions/checkout@v6
-      - name: Check model naming conventions
-        run: |
-          python3 - << 'EOF'
-          import re, os, sys
-
-          pairs = re.findall(
-              r'case\s+(LLM_ARCH_\w+)\s*:\s*\n\s+return new (llama_model_\w+)\s*\(',
-              open("src/llama-model.cpp").read())
-
-          errors = []
-          for arch, cls in pairs:
-              suffix  = arch[len("LLM_ARCH_"):]
-              csuffix = cls[len("llama_model_"):]
-              fname   = csuffix.replace("_", "-") + ".cpp"
-
-              if not re.fullmatch(r'[A-Z][A-Z0-9_]*',   suffix):
-                  errors.append(f"{arch}: suffix not upper snake case, example: LLM_ARCH_MY_MODEL")
-
-              if not re.fullmatch(r'[a-z][a-z0-9_]*', csuffix):
-                  errors.append(f"{arch}: class suffix not lower snake case, example: llama_model_my_model")
-
-              elif suffix.lower() != csuffix:
-                  errors.append(f"{arch}: arch/class name mismatch, expected class 'llama_model_{suffix.lower()}' but got '{cls}'")
-
-              elif not os.path.isfile(f"src/models/{fname}"):
-                  errors.append(f"{arch}: expects model file name to be src/models/{fname}, but not found")
-
-          if errors:
-              print('\n'.join(f"  - {e}" for e in errors)); sys.exit(1)
-          print(f"OK: {len(pairs)} mappings validated.")
-          EOF
--- a/.github/workflows/copilot-setup-steps.yml
+++ b/.github/workflows/copilot-setup-steps.yml
@@ -29,7 +29,7 @@ jobs:
        uses: actions/checkout@v6

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: copilot-setup-steps
          evict-old-files: 1d
@@ -52,5 +52,7 @@ jobs:
      - name: Install Python dependencies
        run: |
          python3 -m venv .venv
+          # activate must be sourced: it is not executable and only affects the shell that sources it
          source .venv/bin/activate
          pip install -r requirements/requirements-all.txt -r tools/server/tests/requirements.txt
+          pip install flake8 pyright pre-commit
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -11,11 +11,6 @@ name: Publish Docker image

 on:
  workflow_dispatch: # allows manual triggering
-    inputs:
-      skip_s390x:
-        description: "Skip the s390x build target (useful for fast test runs that do not need the IBM Z runner)"
-        type: boolean
-        default: false
  schedule:
    # Rebuild daily rather than on every push because it is expensive
    - cron: '12 4 * * *'
@@ -30,13 +25,186 @@ permissions:
  packages: write

 jobs:
+  push_to_registry:
+    name: Push Docker image to Docker Hub
+
+    runs-on: ${{ matrix.config.runs_on }}
+    env:
+      COMMIT_SHA: ${{ github.sha }}
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          # Multi-stage build
+          # Note: the arm64 images are failing, which prevents the amd64 images from being built
+          # https://github.com/ggml-org/llama.cpp/issues/11888
+          #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "cpu",    dockerfile: ".devops/cpu.Dockerfile",    platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
+          - { tag: "cuda cuda12", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04", cuda_version: "12.4.0", ubuntu_version: "22.04" }
+          - { tag: "cuda13", dockerfile: ".devops/cuda-new.Dockerfile",  platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04", cuda_version: "13.1.0", ubuntu_version: "24.04" }
+          - { tag: "musa",   dockerfile: ".devops/musa.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
+          - { tag: "intel",  dockerfile: ".devops/intel.Dockerfile",  platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
+          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
+          - { tag: "s390x",  dockerfile: ".devops/s390x.Dockerfile",  platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
+          - { tag: "rocm",   dockerfile: ".devops/rocm.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
+          - { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0 # preserve git history, so we can determine the build number
+
+      - name: Set up QEMU
+        if: ${{ matrix.config.tag != 's390x' }}
+        uses: docker/setup-qemu-action@v3
+        with:
+          image: tonistiigi/binfmt:qemu-v7.0.0-28
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Determine source tag name
+        id: srctag
+        uses: ./.github/actions/get-tag-name
+        env:
+          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+      - name: Determine image tag name
+        id: tag
+        shell: bash
+        run: |
+          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
+          REPO_NAME="${{ github.event.repository.name }}"
+          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
+
+          # list all tags possible
+          tags="${{ matrix.config.tag }}"
+          for tag in $tags; do
+              if [[ "$tag" == "cpu" ]]; then
+                  TYPE=""
+              else
+                  TYPE="-$tag"
+              fi
+              CACHETAGS="${PREFIX}buildcache${TYPE}"
+              FULLTAGS="${FULLTAGS:+$FULLTAGS,}${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
+              LIGHTTAGS="${LIGHTTAGS:+$LIGHTTAGS,}${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
+              SERVERTAGS="${SERVERTAGS:+$SERVERTAGS,}${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
+          done
+          echo "cache_output_tags=$CACHETAGS" >> $GITHUB_OUTPUT
+          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
+          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
+          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
+          echo "cache_output_tags=$CACHETAGS"  # print out for debugging
+          echo "full_output_tags=$FULLTAGS"  # print out for debugging
+          echo "light_output_tags=$LIGHTTAGS"  # print out for debugging
+          echo "server_output_tags=$SERVERTAGS"  # print out for debugging
+        env:
+          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
+      - name: Free Disk Space (Ubuntu)
+        if: ${{ matrix.config.free_disk_space == true }}
+        uses: ggml-org/free-disk-space@v1.3.1
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: false
+
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: true
+          swap-storage: true
+
+      - name: Build and push Full Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.full_output_tags }}
+          file: ${{ matrix.config.dockerfile }}
+          target: full
+          provenance: false
+          build-args: |
+            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
+            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
+          # using github experimental cache
+          #cache-from: type=gha
+          #cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+          # using registry cache (no storage limit)
+          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
+          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
+
+      - name: Build and push Light Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.light_output_tags }}
+          file: ${{ matrix.config.dockerfile }}
+          target: light
+          provenance: false
+          build-args: |
+            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
+            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
+          # using github experimental cache
+          #cache-from: type=gha
+          #cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+          # using registry cache (no storage limit)
+          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
+          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
+
+      - name: Build and push Server Docker image (tagged + versioned)
+        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          # tag list is generated from step above
+          tags: ${{ steps.tag.outputs.server_output_tags }}
+          file: ${{ matrix.config.dockerfile }}
+          target: server
+          provenance: false
+          build-args: |
+            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
+            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
+          # using github experimental cache
+          #cache-from: type=gha
+          #cache-to: type=gha,mode=max
+          # return to this if the experimental github cache is having issues
+          #cache-to: type=local,dest=/tmp/.buildx-cache
+          #cache-from: type=local,src=/tmp/.buildx-cache
+          # using registry cache (no storage limit)
+          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
+          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
+
  create_tag:
    name: Create and push git tag
-    runs-on: ubuntu-slim
+    runs-on: ubuntu-22.04
    permissions:
      contents: write
-    outputs:
-      source_tag: ${{ steps.srctag.outputs.name }}

    steps:
      - name: Clone
@@ -57,459 +225,3 @@ jobs:
        run: |
          git tag ${{ steps.srctag.outputs.name }} || exit 0
          git push origin ${{ steps.srctag.outputs.name }} || exit 0
-
-  prepare_matrices:
-    name: Prepare Docker matrices
-    runs-on: ubuntu-24.04
-    outputs:
-      build_matrix: ${{ steps.matrices.outputs.build_matrix }}
-      merge_matrix: ${{ steps.matrices.outputs.merge_matrix }}
-
-    steps:
-      - name: Generate build and merge matrices
-        id: matrices
-        shell: bash
-        env:
-          SKIP_S390X: ${{ inputs.skip_s390x || 'false' }}
-        run: |
-          set -euo pipefail
-
-          # Keep all build targets in one place and derive merge targets from it.
-          cat > build-matrix.json <<'JSON'
-          [
-            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
-            { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" },
-            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.1.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "musa", "dockerfile": ".devops/musa.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "intel", "dockerfile": ".devops/intel.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" },
-            { "tag": "vulkan", "dockerfile": ".devops/vulkan.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" },
-            { "tag": "rocm", "dockerfile": ".devops/rocm.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" },
-            { "tag": "openvino", "dockerfile": ".devops/openvino.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" }
-          ]
-          JSON
-
-          if [ "${SKIP_S390X}" = "true" ]; then
-            jq 'map(select(.platforms != "linux/s390x"))' build-matrix.json > build-matrix.json.tmp
-            mv build-matrix.json.tmp build-matrix.json
-          fi
-
-          BUILD_MATRIX="$(jq -c . build-matrix.json)"
-          MERGE_MATRIX="$(jq -c '
-            reduce .[] as $entry ({}; .[$entry.tag] |= (
-              . // {
-                tag: $entry.tag,
-                arches: [],
-                full: false,
-                light: false,
-                server: false
-              }
-              | .full = (.full or ($entry.full // false))
-              | .light = (.light or ($entry.light // false))
-              | .server = (.server or ($entry.server // false))
-              | .arches += [($entry.platforms | sub("^linux/"; ""))]
-            ))
-            # Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
-            | if (has("cpu") and (((.cpu.arches // []) | index("s390x")) != null)) then
-                . + {
-                  s390x: {
-                    tag: "s390x",
-                    arches: ["s390x"],
-                    full: .cpu.full,
-                    light: .cpu.light,
-                    server: .cpu.server
-                  }
-                }
-              else
-                .
-              end
-            | [.[] | .arches = (.arches | unique | sort | join(" "))]
-          ' build-matrix.json)"
-
-          echo "build_matrix=$BUILD_MATRIX" >> "$GITHUB_OUTPUT"
-          echo "merge_matrix=$MERGE_MATRIX" >> "$GITHUB_OUTPUT"
-
-  push_to_registry:
-    name: Push Docker image to Docker Registry
-    needs: [prepare_matrices, create_tag]
-
-    runs-on: ${{ matrix.config.runs_on }}
-    strategy:
-      fail-fast: false
-      matrix:
-        config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }}
-    steps:
-      - name: Check out the repo
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ needs.create_tag.outputs.source_tag }}
-
-      - name: Set up QEMU
-        if: ${{ contains(matrix.config.platforms, 'linux/amd64') }}
-        uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4
-        with:
-          image: tonistiigi/binfmt:qemu-v10.2.1
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
-
-      - name: Log in to Docker Registry
-        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Determine image metadata
-        id: meta
-        shell: bash
-        run: |
-          set -euo pipefail
-
-          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
-          REPO_NAME="${{ github.event.repository.name }}"
-          IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
-          PREFIX="${IMAGE_REPO}:"
-          PLATFORM="${{ matrix.config.platforms }}"
-          ARCH_SUFFIX="${PLATFORM#linux/}"
-
-          # list all tags possible
-          tags="${{ matrix.config.tag }}"
-          for tag in $tags; do
-              if [[ "$tag" == "cpu" ]]; then
-                  TYPE=""
-              else
-                  TYPE="-$tag"
-              fi
-              CACHETAG="${PREFIX}buildcache${TYPE}-${ARCH_SUFFIX}"
-          done
-
-          SAFE_TAGS="$(echo "$tags" | tr ' ' '_')"
-
-          echo "image_repo=$IMAGE_REPO" >> $GITHUB_OUTPUT
-          echo "arch_suffix=$ARCH_SUFFIX" >> $GITHUB_OUTPUT
-          echo "cache_output_tag=$CACHETAG" >> $GITHUB_OUTPUT
-          echo "digest_artifact_suffix=${SAFE_TAGS}-${ARCH_SUFFIX}" >> $GITHUB_OUTPUT
-          echo "cache_output_tag=$CACHETAG"  # print out for debugging
-        env:
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-
-      - name: Get build date
-        id: build_date
-        run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT
-
-      - name: Free Disk Space (Ubuntu)
-        if: ${{ matrix.config.free_disk_space == true }}
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: false
-
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: true
-          swap-storage: true
-
-      - name: Build and push Full Docker image by digest
-        id: build_full
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
-        uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
-        with:
-          context: .
-          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
-          file: ${{ matrix.config.dockerfile }}
-          target: full
-          provenance: false
-          build-args: |
-            BUILD_DATE=${{ steps.build_date.outputs.date }}
-            APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
-            APP_REVISION=${{ steps.checkout.outputs.commit }}
-            IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
-            IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
-            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
-            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
-          annotations: |
-            manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
-            manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
-            manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
-            manifest:org.opencontainers.image.title=llama.cpp
-            manifest:org.opencontainers.image.description=LLM inference in C/C++
-            manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
-            manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
-          # using github experimental cache
-          #cache-from: type=gha
-          #cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
-          cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
-
-      - name: Build and push Light Docker image by digest
-        id: build_light
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
-        uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
-        with:
-          context: .
-          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
-          file: ${{ matrix.config.dockerfile }}
-          target: light
-          provenance: false
-          build-args: |
-            BUILD_DATE=${{ steps.build_date.outputs.date }}
-            APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
-            APP_REVISION=${{ steps.checkout.outputs.commit }}
-            IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
-            IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
-            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
-            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
-          annotations: |
-            manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
-            manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
-            manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
-            manifest:org.opencontainers.image.title=llama.cpp
-            manifest:org.opencontainers.image.description=LLM inference in C/C++
-            manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
-            manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
-          # using github experimental cache
-          #cache-from: type=gha
-          #cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
-          cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
-
-      - name: Build and push Server Docker image by digest
-        id: build_server
-        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
-        uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7
-        with:
-          context: .
-          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
-          file: ${{ matrix.config.dockerfile }}
-          target: server
-          provenance: false
-          build-args: |
-            BUILD_DATE=${{ steps.build_date.outputs.date }}
-            APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
-            APP_REVISION=${{ steps.checkout.outputs.commit }}
-            IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
-            IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
-            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
-            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
-          annotations: |
-            manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
-            manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
-            manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
-            manifest:org.opencontainers.image.title=llama.cpp
-            manifest:org.opencontainers.image.description=LLM inference in C/C++
-            manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
-            manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
-          # using github experimental cache
-          #cache-from: type=gha
-          #cache-to: type=gha,mode=max
-          # return to this if the experimental github cache is having issues
-          #cache-to: type=local,dest=/tmp/.buildx-cache
-          #cache-from: type=local,src=/tmp/.buildx-cache
-          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }}
-          cache-to: type=registry,ref=${{ steps.meta.outputs.cache_output_tag }},mode=max
-
-      - name: Export digest metadata
-        shell: bash
-        run: |
-            set -euo pipefail
-
-            TAGS="${{ matrix.config.tag }}"
-            ARCH_SUFFIX="${{ steps.meta.outputs.arch_suffix }}"
-            DIGEST_FILE="/tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv"
-            mkdir -p /tmp/digests
-
-            add_digest_rows() {
-                local image_type="$1"
-                local digest="$2"
-
-                if [[ -z "$digest" ]]; then
-                  echo "Missing digest for image_type=${image_type}" >&2
-                  exit 1
-                fi
-
-                for tag in $TAGS; do
-                    printf '%s\t%s\t%s\t%s\n' "$tag" "$ARCH_SUFFIX" "$image_type" "$digest" >> "$DIGEST_FILE"
-                done
-            }
-
-            if [[ "${{ matrix.config.full }}" == "true" ]]; then
-                add_digest_rows "full" "${{ steps.build_full.outputs.digest }}"
-            fi
-
-            if [[ "${{ matrix.config.light }}" == "true" ]]; then
-                add_digest_rows "light" "${{ steps.build_light.outputs.digest }}"
-            fi
-
-            if [[ "${{ matrix.config.server }}" == "true" ]]; then
-                add_digest_rows "server" "${{ steps.build_server.outputs.digest }}"
-            fi
-
-      - name: Upload digest metadata
-        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7
-        with:
-          name: digests-${{ steps.meta.outputs.digest_artifact_suffix }}
-          path: /tmp/digests/${{ steps.meta.outputs.digest_artifact_suffix }}.tsv
-          if-no-files-found: error
-
-  merge_arch_tags:
-    name: Create shared tags from digests
-    needs: [prepare_matrices, push_to_registry, create_tag]
-    runs-on: ubuntu-24.04
-    strategy:
-      fail-fast: false
-      matrix:
-        config: ${{ fromJSON(needs.prepare_matrices.outputs.merge_matrix) }}
-
-    steps:
-      - name: Check out the repo
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Get build date
-        id: build_date
-        run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT
-
-      - name: Download digest metadata
-        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
-        with:
-          pattern: digests-*
-          path: /tmp/digests
-          merge-multiple: true
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4
-
-      - name: Log in to Docker Registry
-        uses: docker/login-action@b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Create tags from digests
-        shell: bash
-        run: |
-          set -euo pipefail
-
-          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
-          REPO_NAME="${{ github.event.repository.name }}"
-          IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
-          PREFIX="${IMAGE_REPO}:"
-          SRC_TAG="${{ needs.create_tag.outputs.source_tag }}"
-          BUILD_DATE="${{ steps.build_date.outputs.date }}"
-          COMMIT_SHA="${{ steps.checkout.outputs.commit }}"
-          TAGS="${{ matrix.config.tag }}"
-          ARCHES="${{ matrix.config.arches }}"
-          DIGEST_GLOB="/tmp/digests/*.tsv"
-
-          if ! ls ${DIGEST_GLOB} >/dev/null 2>&1; then
-              echo "No digest metadata found in /tmp/digests" >&2
-              exit 1
-          fi
-
-          if [[ -z "$SRC_TAG" ]]; then
-              echo "Missing source tag from create_tag" >&2
-              exit 1
-          fi
-
-          find_digest() {
-              local tag_name="$1"
-              local arch="$2"
-              local image_type="$3"
-              local digest
-
-              digest="$(awk -F '\t' -v t="$tag_name" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
-
-              # Backward compatibility: s390x tags are aliases of cpu for the linux/s390x platform.
-              if [[ -z "$digest" && "$tag_name" == "s390x" && "$arch" == "s390x" ]]; then
-                digest="$(awk -F '\t' -v t="cpu" -v a="$arch" -v i="$image_type" '$1 == t && $2 == a && $3 == i { print $4; exit }' ${DIGEST_GLOB})"
-              fi
-
-              if [[ -z "$digest" ]]; then
-                echo "Missing digest for tag=${tag_name} arch=${arch} image_type=${image_type}" >&2
-                exit 1
-              fi
-
-              echo "$digest"
-          }
-
-          create_manifest_tags() {
-              local image_type="$1"
-              local tag_name="$2"
-              local suffix="$3"
-
-              local merged_tag="${PREFIX}${image_type}${suffix}"
-              local merged_versioned_tag="${merged_tag}-${SRC_TAG}"
-
-              local refs=()
-
-              for arch in $ARCHES; do
-                  local digest
-                  digest="$(find_digest "$tag_name" "$arch" "$image_type")"
-                  refs+=("${IMAGE_REPO}@${digest}")
-              done
-
-              local annotations=(
-                  --annotation "index:org.opencontainers.image.created=${BUILD_DATE}"
-                  --annotation "index:org.opencontainers.image.version=${SRC_TAG}"
-                  --annotation "index:org.opencontainers.image.revision=${COMMIT_SHA}"
-                  --annotation "index:org.opencontainers.image.title=llama.cpp"
-                  --annotation "index:org.opencontainers.image.description=LLM inference in C/C++"
-                  --annotation "index:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}"
-                  --annotation "index:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}"
-              )
-
-              echo "Creating ${merged_tag} from ${refs[*]}"
-              docker buildx imagetools create "${annotations[@]}" --tag "${merged_tag}" "${refs[@]}"
-
-              echo "Creating ${merged_versioned_tag} from ${refs[*]}"
-              docker buildx imagetools create "${annotations[@]}" --tag "${merged_versioned_tag}" "${refs[@]}"
-          }
-
-          for tag in $TAGS; do
-              if [[ "$tag" == "cpu" ]]; then
-                  TYPE=""
-              else
-                  TYPE="-$tag"
-              fi
-
-              if [[ "${{ matrix.config.full }}" == "true" ]]; then
-                  create_manifest_tags "full" "$tag" "$TYPE"
-              fi
-
-              if [[ "${{ matrix.config.light }}" == "true" ]]; then
-                  create_manifest_tags "light" "$tag" "$TYPE"
-              fi
-
-              if [[ "${{ matrix.config.server }}" == "true" ]]; then
-                  create_manifest_tags "server" "$tag" "$TYPE"
-              fi
-          done
-        env:
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -2,6 +2,11 @@ name: EditorConfig Checker

 on:
  workflow_dispatch: # allows manual triggering
+    inputs:
+      create_release:
+        description: 'Create new release'
+        required: true
+        type: boolean
  push:
    branches:
      - master
@@ -15,10 +20,10 @@ concurrency:

 jobs:
  editorconfig:
-    runs-on: [self-hosted, fast]
+    runs-on: ubuntu-slim
    steps:
      - uses: actions/checkout@v6
-      - uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
+      - uses: editorconfig-checker/action-editorconfig-checker@v2
        with:
          version: v3.0.3
      - run: editorconfig-checker
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@@ -28,17 +28,17 @@ jobs:
    - name: Set up Python
      uses: actions/setup-python@v6
      with:
-        python-version: '3.11'
-        pip-install: poetry==2.4.0
+        python-version: '3.9.x'
    - name: Install dependencies
      run: |
        cd gguf-py
+        python -m pip install poetry
        poetry install

    - name: Build package
      run: cd gguf-py && poetry build
    - name: Publish package
-      uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # release/v1
+      uses: pypa/gh-action-pypi-publish@release/v1
      with:
        password: ${{ secrets.PYPI_API_TOKEN }}
        packages-dir: gguf-py/dist
--- a/.github/workflows/hip-quality-check.yml
+++ b/.github/workflows/hip-quality-check.yml
@@ -1,82 +0,0 @@
-name: HIP quality check
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/hip-quality-check.yml',
-      '**/*.cu',
-      '**/*.cuh',
-      'scripts/hip/gcn-cdna-vgpr-check.py'
-    ]
-
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/hip-quality-check.yml',
-      '**/*.cu',
-      '**/*.cuh',
-      'scripts/hip/gcn-cdna-vgpr-check.py'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-
-jobs:
-  ubuntu-22-hip-quality-check:
-    runs-on: ubuntu-22.04
-    container: rocm/dev-ubuntu-22.04:7.2.1
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev python3
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-22-hip-quality-check
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build with Werror
-        id: cmake_build
-        run: |
-          cmake -B build -S . \
-            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-            -DGPU_TARGETS=gfx942 \
-            -DGGML_HIP=ON \
-            -DGGML_HIP_EXPORT_METRICS=Off \
-            -DCMAKE_HIP_FLAGS="-Werror -Wno-tautological-compare" \
-            -DCMAKE_BUILD_TYPE=Release
-          cd build
-          make -j $(nproc)
-
-      - name: Check for major VGPR spills
-        id: vgpr_check
-        run: |
-          cmake -B build -S . \
-            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-            -DGPU_TARGETS=gfx908 \
-            -DGGML_HIP=ON \
-            -DGGML_HIP_EXPORT_METRICS=On \
-            -DCMAKE_HIP_FLAGS="" \
-            -DCMAKE_BUILD_TYPE=Release
-          cd build
-          make -j $(nproc) 2>&1 | tee metrics.log | grep -v 'Rpass-analysis=kernel-resource-usage\|remark:\|^$'
-          python3 ../scripts/hip/gcn-cdna-vgpr-check.py metrics.log
--- a/.github/workflows/pre-tokenizer-hashes.yml
+++ b/.github/workflows/pre-tokenizer-hashes.yml
@@ -3,16 +3,16 @@ name: Check Pre-Tokenizer Hashes
 on:
    push:
        paths:
-            - 'conversion/base.py'
+            - 'convert_hf_to_gguf.py'
            - 'convert_hf_to_gguf_update.py'
    pull_request:
        paths:
-            - 'conversion/base.py'
+            - 'convert_hf_to_gguf.py'
            - 'convert_hf_to_gguf_update.py'

 jobs:
    pre-tokenizer-hashes:
-        runs-on: [self-hosted, fast]
+        runs-on: ubuntu-slim

        steps:
        - name: Checkout repository
@@ -30,16 +30,16 @@ jobs:

        - name: Update pre-tokenizer hashes
          run: |
-              cp conversion/base.py /tmp
+              cp convert_hf_to_gguf.py /tmp
              .venv/bin/python convert_hf_to_gguf_update.py --check-missing

        - name: Check if committed pre-tokenizer hashes matches generated version
          run: |
-              if ! diff -q conversion/base.py /tmp/base.py; then
-                  echo "Model pre-tokenizer hashes (in conversion/base.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
-                  echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated conversion/base.py along with your changes"
+              if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
+                  echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
+                  echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
                  echo "Differences found:"
-                  diff conversion/base.py /tmp/base.py || true
+                  diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
                  exit 1
              fi
              echo "Model pre-tokenizer hashes are up to date."
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -20,7 +20,7 @@ concurrency:

 jobs:
  python-check-requirements:
-    runs-on: [self-hosted, CPU, fast]
+    runs-on: ubuntu-slim
    name: check-requirements
    steps:
      - name: Check out source repository
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -21,7 +21,7 @@ concurrency:

 jobs:
  flake8-lint:
-    runs-on: [self-hosted, fast]
+    runs-on: ubuntu-slim
    name: Lint
    steps:
      - name: Check out source repository
@@ -31,6 +31,6 @@ jobs:
        with:
          python-version: "3.11"
      - name: flake8 Lint
-        uses: py-actions/flake8@84ec6726560b6d5bd68f2a5bed83d62b52bb50ba # v2
+        uses: py-actions/flake8@v2
        with:
            plugins: "flake8-no-print"
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -4,17 +4,15 @@ on:
  push:
    paths:
      - '.github/workflows/python-type-check.yml'
-      - 'ty.toml'
+      - 'pyrightconfig.json'
      - '**.py'
      - '**/requirements*.txt'
-      # - 'pyrightconfig.json'
  pull_request:
    paths:
      - '.github/workflows/python-type-check.yml'
-      - 'ty.toml'
+      - 'pyrightconfig.json'
      - '**.py'
      - '**/requirements*.txt'
-      # - 'pyrightconfig.json'

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -22,8 +20,8 @@ concurrency:

 jobs:
  python-type-check:
-    runs-on: [self-hosted, fast]
-    name: python type-check
+    runs-on: ubuntu-latest
+    name: pyright type-check
    steps:
      - name: Check out source repository
        uses: actions/checkout@v6
@@ -31,13 +29,10 @@ jobs:
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"
-          pip-install: -r requirements/requirements-all.txt ty==0.0.35
-      # - name: Type-check with Pyright
-      #   uses: jakebailey/pyright-action@v2
-      #   with:
-      #     version: 1.1.382
-      #     level: warning
-      #     warnings: true
-      - name: Type-check with ty
-        run: |
-            ty check --output-format=github
+          pip-install: -r requirements/requirements-all.txt
+      - name: Type-check with Pyright
+        uses: jakebailey/pyright-action@v2
+        with:
+          version: 1.1.382
+          level: warning
+          warnings: true
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -36,28 +36,8 @@ env:
  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"

 jobs:
-
-  macOS-cpu:
-
-    strategy:
-      matrix:
-        include:
-          - build: 'arm64'
-            arch: 'arm64'
-            os: macos-14
-            defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON"
-          - build: 'arm64-kleidiai'
-            arch: 'arm64'
-            os: macos-14
-            defines: "-DGGML_METAL_USE_BF16=ON -DGGML_METAL_EMBED_LIBRARY=ON -DGGML_CPU_KLEIDIAI=ON"
-          - build: 'x64'
-            arch: 'x64'
-            os: macos-15-intel
-            # Metal is disabled on x64 due to intermittent failures with Github runners not having a GPU:
-            # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-            defines: "-DGGML_METAL=OFF -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3"
-
-    runs-on: ${{ matrix.os }}
+  macOS-arm64:
+    runs-on: macos-14

    steps:
      - name: Clone
@@ -66,17 +46,10 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: macOS-latest-${{ matrix.arch }}
+          key: macOS-latest-arm64
          evict-old-files: 1d

      - name: Build
@@ -84,11 +57,13 @@ jobs:
        run: |
          sysctl -a
          cmake -B build \
-            ${{ matrix.defines }} \
            -DCMAKE_INSTALL_RPATH='@loader_path' \
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DGGML_RPC=ON \
            ${{ env.CMAKE_ARGS }}
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

@@ -100,25 +75,73 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz -s ",^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-macos-${{ matrix.build }}.tar.gz
-          name: llama-bin-macos-${{ matrix.build }}.tar.gz
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz
+          name: llama-bin-macos-arm64.tar.gz

-  ubuntu-cpu:
+  macOS-x64:
+    runs-on: macos-15-intel

+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: macOS-latest-x64
+          evict-old-files: 1d
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
+          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          cmake -B build \
+            -DCMAKE_INSTALL_RPATH='@loader_path' \
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DGGML_METAL=OFF \
+            -DGGML_RPC=ON \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        run: |
+          cp LICENSE ./build/bin/
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v6
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz
+          name: llama-bin-macos-x64.tar.gz
+
+  ubuntu-22-cpu:
    strategy:
      matrix:
        include:
          - build: 'x64'
            os: ubuntu-22.04
-          - build: 'arm64'
-            os: ubuntu-24.04-arm
          - build: 's390x'
            os: ubuntu-24.04-s390x
+          # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
+          # - build: 'arm64'
+          #   os: ubuntu-22.04-arm

    runs-on: ${{ matrix.os }}

@@ -129,16 +152,8 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
-        if: ${{ matrix.build != 's390x' }}
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ubuntu-cpu-${{ matrix.build }}
          evict-old-files: 1d
@@ -149,13 +164,6 @@ jobs:
          sudo apt-get update
          sudo apt-get install build-essential libssl-dev

-      - name: Toolchain workaround (GCC 14)
-        if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
-        run: |
-          sudo apt-get install -y gcc-14 g++-14
-          echo "CC=gcc-14" >> "$GITHUB_ENV"
-          echo "CXX=g++-14" >> "$GITHUB_ENV"
-
      - name: Build
        id: cmake_build
        run: |
@@ -177,7 +185,7 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -185,17 +193,8 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz
          name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz

-  ubuntu-vulkan:
-
-    strategy:
-      matrix:
-        include:
-          - build: 'x64'
-            os: ubuntu-22.04
-          - build: 'arm64'
-            os: ubuntu-24.04-arm
-
-    runs-on: ${{ matrix.os }}
+  ubuntu-22-vulkan:
+    runs-on: ubuntu-22.04

    steps:
      - name: Clone
@@ -204,33 +203,19 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: ubuntu-vulkan-${{ matrix.build }}
+          key: ubuntu-22-vulkan
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
-          if [[ "${{ matrix.os }}" =~ "ubuntu-22.04" ]]; then
-            wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
-            sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-            sudo apt-get update -y
-            sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev
-          else
-            sudo apt-get update -y
-            sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
-            echo "CC=gcc-14" >> "$GITHUB_ENV"
-            echo "CXX=g++-14" >> "$GITHUB_ENV"
-          fi
+          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo apt-get update -y
+          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libssl-dev

      - name: Build
        id: cmake_build
@@ -253,93 +238,15 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
-          name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz
-
-  android-arm64:
-
-    runs-on: ubuntu-latest
-
-    env:
-      NDK_VERSION: "29.0.14206865"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: android-arm64
-          evict-old-files: 1d
-
-      - name: Set up JDK
-        uses: actions/setup-java@v5
-        with:
-          java-version: 17
-          distribution: temurin
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
-        with:
-          log-accepted-android-sdk-licenses: false
-
-      - name: Install NDK
-        run: |
-          sdkmanager "ndk;${{ env.NDK_VERSION }}"
-          echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build \
-            -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
-            -DANDROID_ABI=arm64-v8a \
-            -DANDROID_PLATFORM=android-28 \
-            -DCMAKE_INSTALL_RPATH='$ORIGIN' \
-            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-            -DGGML_BACKEND_DL=ON \
-            -DGGML_NATIVE=OFF \
-            -DGGML_CPU_ALL_VARIANTS=ON \
-            -DLLAMA_FATAL_WARNINGS=ON \
-            -DGGML_OPENMP=OFF \
-            -DLLAMA_BUILD_BORINGSSL=ON \
-            ${{ env.CMAKE_ARGS }}
-          cmake --build build --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz
-          name: llama-bin-android-arm64.tar.gz
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz
+          name: llama-bin-ubuntu-vulkan-x64.tar.gz

  ubuntu-24-openvino:
-
    runs-on: ubuntu-24.04

    outputs:
@@ -361,15 +268,8 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ubuntu-24-openvino-release-no-preset-v1
          evict-old-files: 1d
@@ -418,7 +318,7 @@ jobs:
        id: pack_artifacts
        run: |
          cp LICENSE ./build/ReleaseOV/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/ReleaseOV/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -427,7 +327,6 @@ jobs:
          name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz

  windows-cpu:
-
    runs-on: windows-2025

    strategy:
@@ -442,15 +341,8 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: windows-latest-cpu-${{ matrix.arch }}
          variant: ccache
@@ -487,7 +379,6 @@ jobs:
          name: llama-bin-win-cpu-${{ matrix.arch }}.zip

  windows:
-
    runs-on: windows-2025

    env:
@@ -511,15 +402,8 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: windows-latest-${{ matrix.backend }}-${{ matrix.arch }}
          variant: ccache
@@ -577,7 +461,6 @@ jobs:
          name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip

  windows-cuda:
-
    runs-on: windows-2022

    strategy:
@@ -589,15 +472,8 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: Install ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: windows-cuda-${{ matrix.cuda }}
          variant: ccache
@@ -656,7 +532,6 @@ jobs:
          name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip

  windows-sycl:
-
    runs-on: windows-2022

    defaults:
@@ -664,51 +539,26 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      - name: Install Level Zero SDK
-        shell: pwsh
-        run: |
-          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: windows-latest-sycl
          variant: ccache
          evict-old-files: 1d

+      - name: Install
+        run:  |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+
      - name: Build
        id: cmake_build
        shell: cmd
@@ -736,13 +586,6 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
-          ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true)
-          if [ -n "$ZE_LOADER_DLL" ]; then
-            echo "Using Level Zero loader: $ZE_LOADER_DLL"
-            cp "$ZE_LOADER_DLL" ./build/bin
-          else
-            echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime"
-          fi

          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
@@ -768,107 +611,14 @@ jobs:
          path: llama-bin-win-sycl-x64.zip
          name: llama-bin-win-sycl-x64.zip

-  ubuntu-24-sycl:
-
-    strategy:
-      matrix:
-        build: [fp32]
-        include:
-          - build: fp32
-            fp16: OFF
-
-    runs-on: ubuntu-24.04
-
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-      LEVEL_ZERO_VERSION: "1.28.2"
-      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: Install Level Zero SDK
-        shell: bash
-        run: |
-          cd /tmp
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-sycl-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-      - name: Determine tag name
-        id: tag
-        uses: ./.github/actions/get-tag-name
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        run: |
-          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v6
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
-          name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz
-
  ubuntu-22-rocm:
-
    runs-on: ubuntu-22.04

    strategy:
      matrix:
        include:
-          - ROCM_VERSION: "7.2.1"
-            gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1150;gfx1200;gfx1201"
+          - ROCM_VERSION: "7.2"
+            gpu_targets: "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201"
            build: 'x64'

    steps:
@@ -878,20 +628,8 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: Free up disk space
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          tool-cache: true
-
      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}
          evict-old-files: 1d
@@ -902,7 +640,7 @@ jobs:
          sudo apt install -y build-essential git cmake wget

      - name: Setup Legacy ROCm
-        if: matrix.ROCM_VERSION == '7.2.1'
+        if: matrix.ROCM_VERSION == '7.2'
        id: legacy_env
        run: |
          sudo mkdir --parents --mode=0755 /etc/apt/keyrings
@@ -923,7 +661,7 @@ jobs:
          sudo apt-get install -y libssl-dev rocm-hip-sdk

      - name: Setup TheRock
-        if: matrix.ROCM_VERSION != '7.2.1'
+        if: matrix.ROCM_VERSION != '7.2'
        id: therock_env
        run: |
          wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz
@@ -939,6 +677,7 @@ jobs:
        run: |
          cmake -B build -S . \
            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
+            -DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \
            -DCMAKE_BUILD_TYPE=Release \
            -DGGML_BACKEND_DL=ON \
            -DGGML_NATIVE=OFF \
@@ -956,23 +695,19 @@ jobs:
        id: tag
        uses: ./.github/actions/get-tag-name

-      - name: Get ROCm short version
-        run: echo "ROCM_VERSION_SHORT=$(echo '${{ matrix.ROCM_VERSION }}' | cut -d '.' -f 1,2)" >> $GITHUB_ENV
-
      - name: Pack artifacts
        id: pack_artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
-          name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz
+          name: llama-bin-ubuntu-rocm-${{ matrix.ROCM_VERSION }}-${{ matrix.build }}.tar.gz

  windows-hip:
-
    runs-on: windows-2022

    env:
@@ -989,17 +724,10 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: Grab rocWMMA package
        id: grab_rocwmma
        run: |
-          curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
+          curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb"
          7z x rocwmma.deb
          7z x data.tar

@@ -1011,7 +739,7 @@ jobs:
          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}

      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
+        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: windows-latest-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
          evict-old-files: 1d
@@ -1056,7 +784,7 @@ jobs:
          cmake -G "Unix Makefiles" -B build -S . `
            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
            -DCMAKE_BUILD_TYPE=Release `
            -DGGML_BACKEND_DL=ON `
            -DGGML_NATIVE=OFF `
@@ -1106,7 +834,6 @@ jobs:
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_OPENSSL=OFF \
-            -DLLAMA_BUILD_APP=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -1179,7 +906,7 @@ jobs:
      - name: Set container image
        id: cann-image
        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
+          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc2-910b-openeuler24.03-py3.11' || '8.3.rc2-310p-openeuler24.03-py3.11' }}"
          echo "image=${image}" >> "${GITHUB_OUTPUT}"

      - name: Pull container image
@@ -1224,7 +951,7 @@ jobs:
      - name: Pack artifacts
        run: |
          cp LICENSE ./build/bin/
-          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin .
+          tar -czvf llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .

      - name: Upload artifacts
        uses: actions/upload-artifact@v6
@@ -1232,9 +959,6 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz

-  ui-build:
-    uses: ./.github/workflows/ui-build.yml
-
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@@ -1252,18 +976,13 @@ jobs:
      - windows-sycl
      - windows-hip
      - ubuntu-22-rocm
-      - ubuntu-cpu
-      - ubuntu-vulkan
+      - ubuntu-22-cpu
+      - ubuntu-22-vulkan
      - ubuntu-24-openvino
-      - ubuntu-24-sycl
-      - android-arm64
-      - macOS-cpu
+      - macOS-arm64
+      - macOS-x64
      - ios-xcode-build
      - openEuler-cann
-      - ui-build
-
-    outputs:
-      tag_name: ${{ steps.tag.outputs.name }}

    steps:
      - name: Clone
@@ -1320,18 +1039,6 @@ jobs:
          mv -v artifact/*.zip release
          mv -v artifact/*.tar.gz release

-      - name: Download UI build
-        id: download_ui
-        uses: actions/download-artifact@v7
-        with:
-          name: ui-build
-          path: ./ui-dist
-
-      - name: Package UI
-        id: package_ui
-        run: |
-          tar -czvf release/llama-${{ steps.tag.outputs.name }}-ui.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./ui-dist .
-
      - name: Create release
        id: create_release
        uses: ggml-org/action-create-release@v1
@@ -1348,22 +1055,15 @@ jobs:

            **macOS/iOS:**
            - [macOS Apple Silicon (arm64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz)
-            - [macOS Apple Silicon (arm64, KleidiAI enabled)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-arm64-kleidiai.tar.gz)
            - [macOS Intel (x64)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz)
            - [iOS XCFramework](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-xcframework.zip)

            **Linux:**
            - [Ubuntu x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.tar.gz)
-            - [Ubuntu arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-arm64.tar.gz)
-            - [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
            - [Ubuntu x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz)
-            - [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz)
            - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz)
+            - [Ubuntu s390x (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-s390x.tar.gz)
            - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz)
-            - [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz)
-
-            **Android:**
-            - [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz)

            **Windows:**
            - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip)
@@ -1380,9 +1080,6 @@ jobs:
            - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
            - [openEuler aarch64 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64-aclgraph.tar.gz)

-            **UI:**
-            - [UI](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-ui.tar.gz)
-
      - name: Upload release
        id: upload_release
        uses: actions/github-script@v8
@@ -1404,15 +1101,3 @@ jobs:
                });
              }
            }
-
-  ui-publish:
-    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-
-    needs:
-      - release
-
-    uses: ./.github/workflows/ui-publish.yml
-    with:
-      version_tag: ${{ needs.release.outputs.tag_name }}
-    secrets:
-      hf_token: ${{ secrets.HF_TOKEN_UI_STATIC_OUTPUT }}
--- a/.github/workflows/server-sanitize.yml
+++ b/.github/workflows/server-sanitize.yml
@@ -67,13 +67,6 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: Build
        id: cmake_build
        run: |
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -67,13 +67,6 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
      - name: Build
        id: cmake_build
        run: |
@@ -116,73 +109,8 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
-
-      - name: Tests
-        id: server_integration_tests
-        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
-        run: |
-          cd tools/server/tests
-          python3 -m venv venv
-          source venv/bin/activate
-          pip install -r requirements.txt
-          export ${{ matrix.extra_args }}
-          pytest -v -x -m "not slow"
-
-  server-kleidiai:
-    runs-on: ah-ubuntu_22_04-c8g_8x
-
-    name: server-kleidiai (${{ matrix.wf_name }})
-    strategy:
-      matrix:
-        include:
-          - build_type: Release
-            extra_build_flags: "-DGGML_CPU_KLEIDIAI=ON"
-            extra_args: ""
-            wf_name:    "CPUx1, kleidiai"
-      fail-fast: false
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Dependencies
-        id: depends
-        run: |
-          set -euxo pipefail
-          sudo apt-get update
-          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
-          apt-get install -y \
-           build-essential \
-           libssl-dev \
-           python3-venv \
-           gpg \
-           wget \
-           time \
-           git-lfs
-
-          git lfs install
-
-          # install the latest cmake
-          sudo install -d /usr/share/keyrings
-          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
-           | gpg --dearmor \
-           | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
-          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
-           | sudo tee /etc/apt/sources.list.d/kitware.list
-          sudo apt-get update
-          sudo apt-get install -y cmake
-
-      - name: Build
-        id: cmake_build
-        run: |
-          cmake -B build -DGGML_SCHED_NO_REALLOC=ON ${{ matrix.extra_build_flags }}
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
+          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server

      - name: Tests
        id: server_integration_tests
--- a/.github/workflows/server-webui.yml
+++ b/.github/workflows/server-webui.yml
@@ -1,7 +1,7 @@
-name: UI
+name: Server WebUI

 on:
-  workflow_dispatch:
+  workflow_dispatch: # allows manual triggering
    inputs:
      sha:
        description: 'Commit SHA1 to build'
@@ -11,18 +11,18 @@ on:
    branches:
      - master
    paths: [
-      '.github/workflows/ui.yml',
-      '.github/workflows/ui-build.yml',
-      'tools/ui/**.*',
-      'tools/server/tests/**.*'
+      '.github/workflows/server-webui.yml',
+      'tools/server/webui/**.*',
+      'tools/server/tests/**.*',
+      'tools/server/public/**'
    ]
  pull_request:
    types: [opened, synchronize, reopened]
    paths: [
-      '.github/workflows/ui.yml',
-      '.github/workflows/ui-build.yml',
-      'tools/ui/**.*',
-      'tools/server/tests/**.*'
+      '.github/workflows/server-webui.yml',
+      'tools/server/webui/**.*',
+      'tools/server/tests/**.*',
+      'tools/server/public/**'
    ]

 env:
@@ -36,14 +36,9 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  ui-build:
-    name: Build static output
-    uses: ./.github/workflows/ui-build.yml
-
-  ui-checks:
-    name: Checks
-    needs: ui-build
-    runs-on: ubuntu-latest
+  webui-check:
+    name: WebUI Checks
+    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
    continue-on-error: true
    steps:
      - name: Checkout code
@@ -56,89 +51,58 @@ jobs:
        id: node
        uses: actions/setup-node@v6
        with:
-          node-version: "24"
+          node-version: "22"
          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
+          cache-dependency-path: "tools/server/webui/package-lock.json"

      - name: Install dependencies
        id: setup
        if: ${{ steps.node.conclusion == 'success' }}
        run: npm ci
-        working-directory: tools/ui
+        working-directory: tools/server/webui

      - name: Run type checking
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run check
-        working-directory: tools/ui
+        working-directory: tools/server/webui

      - name: Run linting
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run lint
-        working-directory: tools/ui
-
-      - name: Install Playwright browsers
-        id: playwright
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npx playwright install --with-deps
-        working-directory: tools/ui
-
-      - name: Run Client tests
-        if: ${{ always() && steps.playwright.conclusion == 'success' }}
-        run: npm run test:client
-        working-directory: tools/ui
-
-      - name: Run Unit tests
-        if: ${{ always() && steps.playwright.conclusion == 'success' }}
-        run: npm run test:unit
-        working-directory: tools/ui
-
-  e2e-tests:
-    name: E2E Tests
-    needs: ui-build
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        id: node
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: Install dependencies
-        id: setup
-        if: ${{ steps.node.conclusion == 'success' }}
-        run: npm ci
-        working-directory: tools/ui
+        working-directory: tools/server/webui

      - name: Build application
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run build
-        working-directory: tools/ui
+        working-directory: tools/server/webui

      - name: Install Playwright browsers
        id: playwright
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npx playwright install --with-deps
-        working-directory: tools/ui
+        working-directory: tools/server/webui

      - name: Build Storybook
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run build-storybook
-        working-directory: tools/ui
+        working-directory: tools/server/webui
+
+      - name: Run Client tests
+        if: ${{ always() && steps.playwright.conclusion == 'success' }}
+        run: npm run test:client
+        working-directory: tools/server/webui
+
+      - name: Run Unit tests
+        if: ${{ always() && steps.playwright.conclusion == 'success' }}
+        run: npm run test:unit
+        working-directory: tools/server/webui

      - name: Run UI tests
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:ui -- --testTimeout=60000
-        working-directory: tools/ui
+        working-directory: tools/server/webui

      - name: Run E2E tests
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:e2e
-        working-directory: tools/ui
+        working-directory: tools/server/webui
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -54,13 +54,8 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  ui-build:
-    name: Build Web UI
-    uses: ./.github/workflows/ui-build.yml
-
  server:
    runs-on: ubuntu-latest
-    needs: ui-build

    name: server (${{ matrix.wf_name }})
    strategy:
@@ -98,12 +93,6 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: Download built UI
-        uses: actions/download-artifact@v7
-        with:
-          name: ui-build
-          path: tools/ui/dist
-
      - name: Build
        id: cmake_build
        run: |
@@ -146,11 +135,6 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-
      - name: Build
        id: cmake_build
        run: |
--- a/.github/workflows/ui-build.yml
+++ b/.github/workflows/ui-build.yml
@@ -1,43 +0,0 @@
-name: UI Build
-
-on:
-  workflow_call:
-
-jobs:
-  build:
-    runs-on: [self-hosted, fast]
-    env:
-      BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v6
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
-        with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
-
-      - name: Install dependencies
-        run: npm ci
-        working-directory: tools/ui
-
-      - name: Build application
-        run: npm run build
-        working-directory: tools/ui
-
-      - name: Generate checksums
-        run: |
-          cd tools/ui/dist
-          for f in *; do
-            sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
-          done
-
-      - name: Upload built UI
-        uses: actions/upload-artifact@v6
-        with:
-          name: ui-build
-          path: tools/ui/dist/
-          retention-days: 1
--- a/.github/workflows/ui-publish.yml
+++ b/.github/workflows/ui-publish.yml
@@ -1,70 +0,0 @@
-name: UI Publish
-
-on:
-  workflow_call:
-    inputs:
-      version_tag:
-        description: 'Version tag to publish under (e.g., b1234)'
-        required: true
-        type: string
-    secrets:
-      hf_token:
-        description: 'Hugging Face token with write access'
-        required: true
-
-jobs:
-  build:
-    name: Build static output
-    uses: ./.github/workflows/ui-build.yml
-
-  publish:
-    name: Publish UI Static Output
-    needs: build
-    runs-on: ubuntu-24.04-arm
-
-    permissions:
-      contents: read
-
-    env:
-      HF_BUCKET_NAME: ${{ vars.HF_BUCKET_UI_STATIC_OUTPUT }}
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 1
-
-      - name: Download UI build artifact
-        uses: actions/download-artifact@v7
-        with:
-          name: ui-build
-          path: tools/ui/dist/
-
-      - name: Install Hugging Face Hub CLI
-        run: pip install -U huggingface_hub
-
-      - name: Authenticate with Hugging Face
-        run: hf auth login --token ${{ secrets.hf_token }}
-
-      - name: Sync built files to Hugging Face bucket (version tag)
-        run: |
-          # Upload the built files to the Hugging Face bucket under the release version
-          hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
-
-      - name: Sync built files to Hugging Face bucket (latest)
-        run: |
-          # Also upload to the 'latest' directory for fallback downloads
-          hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
-
-      - name: Verify upload
-        run: |
-          # List the files in the bucket to verify the upload
-          hf buckets list hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} -R -h
-
-      - name: Clean up root-level files
-        run: |
-          # Clean up any old root-level files from previous non-versioned deployments
-          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/index.html --yes 2>/dev/null || true
-          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/bundle.js --yes 2>/dev/null || true
-          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/bundle.css --yes 2>/dev/null || true
-          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/loading.html --yes 2>/dev/null || true
--- a/.github/workflows/ui-self-hosted.yml
+++ b/.github/workflows/ui-self-hosted.yml
@@ -1,118 +0,0 @@
-name: UI (self-hosted)
-
-# these are the same as ui.yml, but with self-hosted runners
-# the runners come with pre-installed Playwright browsers version: 1.56.1
-# the jobs are much lighter because they don't need to install node and playwright browsers
-
-on:
-  workflow_dispatch:
-    inputs:
-      sha:
-        description: 'Commit SHA1 to build'
-        required: false
-        type: string
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/ui-self-hosted.yml',
-      '.github/workflows/ui-build.yml',
-      'tools/ui/**.*',
-      'tools/server/tests/**.*'
-    ]
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: [
-      '.github/workflows/ui-self-hosted.yml',
-      '.github/workflows/ui-build.yml',
-      'tools/ui/**.*',
-      'tools/server/tests/**.*'
-    ]
-
-env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  ui-build:
-    name: Build static output
-    uses: ./.github/workflows/ui-build.yml
-
-  ui-checks:
-    name: Checks
-    needs: ui-build
-    runs-on: [self-hosted, PLAYWRIGHT]
-    continue-on-error: true
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Install dependencies
-        id: setup
-        run: npm ci
-        working-directory: tools/ui
-
-      - name: Run type checking
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npm run check
-        working-directory: tools/ui
-
-      - name: Run linting
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npm run lint
-        working-directory: tools/ui
-
-      - name: Run Client tests
-        if: ${{ always() }}
-        run: npm run test:client
-        working-directory: tools/ui
-
-      - name: Run Unit tests
-        if: ${{ always() }}
-        run: npm run test:unit
-        working-directory: tools/ui
-
-  e2e-tests:
-    name: E2E Tests
-    needs: ui-build
-    runs-on: [self-hosted, PLAYWRIGHT]
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Install dependencies
-        id: setup
-        run: npm ci
-        working-directory: tools/ui
-
-      - name: Build application
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npm run build
-        working-directory: tools/ui
-
-      - name: Build Storybook
-        if: ${{ always() }}
-        run: npm run build-storybook
-        working-directory: tools/ui
-
-      - name: Run UI tests
-        if: ${{ always() }}
-        run: npm run test:ui -- --testTimeout=60000
-        working-directory: tools/ui
-
-      - name: Run E2E tests
-        if: ${{ always() }}
-        run: npm run test:e2e
-        working-directory: tools/ui
--- a/.github/workflows/update-ops-docs.yml
+++ b/.github/workflows/update-ops-docs.yml
@@ -3,20 +3,18 @@ name: Update Operations Documentation
 on:
    push:
        paths:
-            - '.github/workflows/update-ops-docs.yml'
            - 'docs/ops.md'
            - 'docs/ops/**'
            - 'scripts/create_ops_docs.py'
    pull_request:
        paths:
-            - '.github/workflows/update-ops-docs.yml'
            - 'docs/ops.md'
            - 'docs/ops/**'
            - 'scripts/create_ops_docs.py'

 jobs:
    update-ops-docs:
-        runs-on: [self-hosted, fast, ARM64]
+        runs-on: ubuntu-slim

        steps:
        - name: Checkout repository
--- a/.gitignore
+++ b/.gitignore
@@ -34,6 +34,7 @@
 /.vscode/
 /nppBackup

+
 # Coverage

 /gcovr-report/
@@ -73,7 +74,6 @@
 !/models/templates

 # Zig
-
 /zig-out/
 /zig-cache/

@@ -92,12 +92,9 @@
 !/examples/sycl/*.bat
 !/examples/sycl/*.sh

-# Server Web UI temporary files (+ legacy directory)
-
+# Server Web UI temporary files
 /tools/server/webui/node_modules
 /tools/server/webui/dist
-/tools/ui/node_modules
-/tools/ui/dist

 # Python

@@ -105,16 +102,11 @@
 __pycache__/
 */poetry.lock
 poetry.toml
-poetry.lock
-uv.lock

 # Nix
-
-flake.lock
 /result

 # Test binaries
-
 /tests/test-backend-ops
 /tests/test-double-float
 /tests/test-grad0
@@ -130,7 +122,6 @@ flake.lock
 /tests/test-tokenizer-1-spm

 # Scripts
-
 !/scripts/install-oneapi.bat

 # Generated by scripts
@@ -139,24 +130,16 @@ flake.lock
 /wikitext-2-raw/

 # Test models for lora adapters
-
 /lora-tests

 # Local scripts
-
 /run-vim.sh
 /run-chat.sh
 /run-spec.sh
 /.ccache/

 # IDE
-
 /*.code-workspace
 /.windsurf/
 # emscripten
 a.out.*
-
-# AGENTS
-
-AGENTS.local.md
-.pi/SYSTEM.md
--- a/.pi/gg/SYSTEM.md
+++ b/.pi/gg/SYSTEM.md
@@ -1,37 +0,0 @@
-You are a coding agent. Here are some very important rules that you must follow:
-
-General:
- Be very precise and concise when writing code, comments, explanations, etc.
- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
- Don't try to build or run the code unless you are explicitly asked to do so
- Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources
-
-Coding:
- When in doubt, always refer to the CONTRIBUTING.md file of the project
- When referencing issues or PRs in comments, use the format:
-  - C/C++ code: `// ref: <url>`
-  - Other (CMake, etc.): `# ref: <url>`
-
-Pull requests (PRs):
- New branch names are prefixed with "gg/"
- Before opening a pull request, ask the user to confirm the description
- When creating a pull request, look for the repository's PR template and follow it
- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
- Ask the user to tell you what model was used and write it in place of [MODEL]
- Always create the pull requests in draft mode
-
-Commits:
- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
- Do not explicitly set the git author in commits - rely on the default git config
- Always use `--no-gpg-sign` when committing
- Never `git push` without explicit confirmation from the user
-
-Resources (read on demand):
- [CONTRIBUTING.md](CONTRIBUTING.md)
- [Build documentation](docs/build.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md)
- [PEG parser](docs/development/parsing.md)
- [Auto parser](docs/autoparser.md)
- [Jinja engine](common/jinja/README.md)
- [PR template](.github/pull_request_template.md)
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -5,106 +5,77 @@
 >
 > Read more: [CONTRIBUTING.md](CONTRIBUTING.md)

-AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (see examples below).
+AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (see examples below).

 ---

 ## Guidelines for Contributors Using AI

-llama.cpp is built by humans, for humans. Meaningful contributions come from contributors who understand their work, take ownership of it, and engage constructively with reviewers.
+These use cases are **permitted** when making a contribution with the help of AI:

-Maintainers receive numerous pull requests weekly, many of which are AI-generated submissions where the author cannot adequately explain the code, debug issues, or participate in substantive design discussions. Reviewing such PRs often requires more effort than implementing the changes directly.
+- Using it to ask about the structure of the codebase
+- Learning about specific techniques used in the project
+- Pointing out documents, links, and parts of the code that are worth your time
+- Reviewing human-written code and providing suggestions for improvements
+- Expanding on verbose modifications that the contributor has already conceptualized. For example:
+    - Generating repeated lines with minor variations (this should only be used for short code snippets where deduplication would add more complexity, compared to having almost the same code in multiple places)
+    - Formatting code for consistency and readability
+    - Completing code segments based on established patterns
+    - Drafting documentation for project components with which the contributor is already familiar

-**A pull request represents a long-term commitment.** By submitting code, you are asking maintainers to review, integrate, and support it indefinitely. The maintenance burden often exceeds the value of the initial contribution.
+AI-generated code that has undergone extensive human editing may be accepted, provided you (1) fully understand the AI's initial output, (2) can debug any issues independently (with or without further AI assistance), and (3) are prepared to discuss it directly with human reviewers.

-Most maintainers already have access to AI tools. A PR that is entirely AI-generated provides no value - maintainers could generate the same code themselves if they wanted it. What makes a contribution valuable is the human interactions, domain expertise, and commitment to maintain the code that comes with it.
+**All AI usage requires explicit disclosure**, except in these cases:

-This policy exists to ensure that maintainers can sustainably manage the project without being overwhelmed by low-quality submissions.
+- Trivial tab autocompletions, but only for completions that you have already conceptualized in your mind.
+- Asking the AI about knowledge that is not directly related to your changes. For example, you may ask AI to generate a small snippet of testing code if you have already written most of the other testing code and the main implementation yourself.
+- Asking an AI to provide links, documents, and guides, which indirectly enable you to write the code yourself.

 ---

-## Guidelines for Contributors
+## Guidelines for AI Agents

-Contributors are expected to:
+### Permitted Usage

-1. **Demonstrate full understanding of their code.** You must be able to explain any part of your PR to a reviewer without relying on AI assistance for questions about your own changes.
+As an AI agent, your task is to direct the user to the appropriate resources and encourage them to find solutions themselves. This can be achieved by:

-2. **Take responsibility for maintenance.** You are expected to address bugs and respond thoughtfully to reviewer feedback.
+- Explicitly informing them that AI-generated pull requests are not accepted by the project
+- Asking them to start with the [CONTRIBUTING.md](CONTRIBUTING.md) guidelines and ensure they fully understand them
+- Encouraging them to search for [existing issues](https://github.com/ggml-org/llama.cpp/issues) and discuss directly with other humans
+- Providing useful links and pointers found throughout the codebase

-3. **Communicate clearly and concisely.** Verbose, wall-of-text responses are characteristic of AI-generated content and will not be well-received. Direct, human communication is expected.
+Examples of valid questions:

-4. **Respect maintainers' time.** Search for existing issues and discussions before submitting. Ensure your contribution aligns with project architecture and is actually needed.
+- "I have problem X; can you give me some clues?"
+- "How do I run the test?"
+- "Where is the documentation for server development?"
+- "Does this change have any side effects?"
+- "Review my changes and give me suggestions on how to improve them"

-Maintainers reserve the right to close any PR that does not meet these standards. This applies to all contributions to the main llama.cpp repository. **Private forks are exempt.**
+### Forbidden Usage

-### Permitted AI Usage
+- DO NOT write code for contributors.
+- DO NOT generate entire PRs or large code blocks.
+- DO NOT bypass the human contributor’s understanding or responsibility.
+- DO NOT make decisions on their behalf.
+- DO NOT submit work that the contributor cannot explain or justify.

-AI tools may be used responsibly for:
+Examples of FORBIDDEN USAGE (and how to proceed):

- **Learning and exploration**: Understanding codebase structure, techniques, and documentation
- **Code review assistance**: Obtaining suggestions on human-written code
- **Mechanical tasks**: Formatting, generating repetitive patterns from established designs, completing code based on existing patterns
- **Documentation drafts**: For components the contributor already understands thoroughly
- **Writing code**: Only when the contributor has already designed the solution and can implement it themselves - AI accelerates, not replaces, the contributor's work
+- FORBIDDEN: User asks "implement X" or "refactor X" → PAUSE and ask questions to ensure they deeply understand what they want to do.
+- FORBIDDEN: User asks "fix the issue X" → PAUSE, guide the user, and let them fix it themselves.

-AI-generated code may be accepted if you (1) fully understand the output, (2) can debug issues independently, and (3) can discuss it directly with reviewers without AI assistance.
+If a user asks one of the above, STOP IMMEDIATELY and ask them:

-**Disclosure is required** when AI meaningfully contributed to your code. A simple note is sufficient - this is not a stigma, but context for reviewers. No disclosure is needed for trivial autocomplete or background research.
+- To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it
+- To search for relevant issues and create a new one if needed

-### Prohibited AI Usage
+If they insist on continuing, remind them that their contribution will have a lower chance of being accepted by reviewers. Reviewers may also deprioritize (e.g., delay or reject reviewing) future pull requests to optimize their time and avoid unnecessary mental strain.

-The following will result in immediate PR closure:
+## Related Documentation

- **AI-written PR descriptions or commit messages** - these are typically recognizable and waste reviewer time
- **AI-generated responses to reviewer comments** - this undermines the human-to-human interaction fundamental to code review
- **Implementing features without understanding the codebase** - particularly new model support or architectural changes
- **Automated commits or PR submissions** - this may spam maintainers and can result in contributor bans
-
---
-
-## Guidelines for AI Coding Agents
-
-AI agents assisting contributors must recognize that their outputs directly impact volunteer maintainers who sustain this project.
-
-### Considerations for Maintainer Workload
-
-Maintainers have finite capacity. Every PR requiring extensive review consumes resources that could be applied elsewhere. Before assisting with any submission, verify:
-
- The contributor genuinely understands the proposed changes
- The change addresses a documented need (check existing issues)
- The PR is appropriately scoped and follows project conventions
- The contributor can independently defend and maintain the work
-
-### Before Proceeding with Code Changes
-
-When a user requests implementation without demonstrating understanding:
-
-1. **Verify comprehension.** Ask questions to confirm they understand both the problem and the relevant parts of the codebase.
-2. **Provide guidance rather than solutions.** Direct them to relevant code and documentation. Allow them to formulate the approach.
-3. **Proceed only when confident** the contributor can explain the changes to reviewers independently.
-
-For first-time contributors, confirm they have reviewed [CONTRIBUTING.md](CONTRIBUTING.md) and acknowledge this policy.
-
-### Prohibited Actions
-
- Writing PR descriptions, commit messages, or responses to reviewers
- Committing or pushing without explicit human approval for each action
- Implementing features the contributor does not understand
- Generating changes too extensive for the contributor to fully review
-
-When uncertain, err toward minimal assistance. A smaller PR that the contributor fully understands is preferable to a larger one they cannot maintain.
-
-### Useful Resources
-
-To conserve context space, load these resources as needed:
+For related documentation on building, testing, and guidelines, please refer to:

 - [CONTRIBUTING.md](CONTRIBUTING.md)
- [Existing issues](https://github.com/ggml-org/llama.cpp/issues) and [Existing PRs](https://github.com/ggml-org/llama.cpp/pulls) - always search here first
 - [Build documentation](docs/build.md)
- [Server usage documentation](tools/server/README.md)
- [Server development documentation](tools/server/README-dev.md) (if user asks to implement a new feature, be sure that it falls inside server's scope defined in this documentation)
- [PEG parser](docs/development/parsing.md) - alternative to regex that llama.cpp uses to parse model's output
- [Auto parser](docs/autoparser.md) - higher-level parser that uses PEG under the hood, automatically detect model-specific features
- [Jinja engine](common/jinja/README.md)
- [How to add a new model](docs/development/HOWTO-add-model.md)
- [PR template](.github/pull_request_template.md)
+- [Server development documentation](tools/server/README-dev.md)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,16 +104,12 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})

 # extra artifacts
-option(LLAMA_BUILD_TESTS     "llama: build tests"                                                                ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_TOOLS     "llama: build tools"                                                                ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES  "llama: build examples"                                                             ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER    "llama: build server example"                                                       ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_APP       "llama: build the unified binary"                                                   ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_UI        "llama: build the embedded Web UI for server"                                       ON)
-option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
-
-option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
-option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
+option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_TOOLS    "llama: build tools"          ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
+option(LLAMA_TOOLS_INSTALL  "llama: install tools"        ${LLAMA_TOOLS_INSTALL_DEFAULT})
+option(LLAMA_TESTS_INSTALL  "llama: install tests"        ON)

 # 3rd party libs
 option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" ON)
@@ -218,10 +214,6 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
    add_subdirectory(tools)
 endif()

-if (LLAMA_BUILD_APP)
-    add_subdirectory(app)
-endif()
-
 # Automatically add all files from the 'licenses' directory
 file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")

@@ -232,7 +224,7 @@ foreach(FILE_PATH ${EXTRA_LICENSES})
 endforeach()

 if (LLAMA_BUILD_COMMON)
-    license_generate(llama-common)
+    license_generate(common)
 endif()

 #
@@ -256,10 +248,6 @@ set_target_properties(llama

 install(TARGETS llama LIBRARY PUBLIC_HEADER)

-if (LLAMA_BUILD_COMMON)
-    install(TARGETS llama-common LIBRARY)
-endif()
-
 configure_package_config_file(
        ${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in
        ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
@@ -277,6 +265,18 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
              ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)

+install(
+    FILES convert_hf_to_gguf.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
+
 configure_file(cmake/llama.pc.in
        "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
        @ONLY)
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,21 +1,5 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
-# multiple collaborators per item can be specified
-#
-# ggml-org/ci               : CISC, danbev, ggerganov, netrunnereve, ngxson, taronaeo
-# ggml-org/ggml-cann        : hipudding
-# ggml-org/ggml-cuda        : JohannesGaessler, am17an, IMbackK, ORippler
-# ggml-org/ggml-hexagon     : lhez, max-krasnyansky
-# ggml-org/ggml-metal       : ggerganov
-# ggml-org/ggml-opencl      : lhez, max-krasnyansky
-# ggml-org/ggml-rpc         : rgerganov
-# ggml-org/ggml-sycl        : arthw
-# ggml-org/ggml-vulkan      : 0cc4m, jeffbolznv
-# ggml-org/ggml-webgpu      : reeselevine
-# ggml-org/ggml-zdnn        : taronaeo
-# ggml-org/llama-common     : ggerganov, aldehir, angt, danbev, ngxson, pwilkin
-# ggml-org/llama-mtmd       : ngxson
-# ggml-org/llama-server     : ggerganov, ngxson, allozaur, angt, ServeurpersoCom
-# ggml-org/llama-ui           : allozaur
+# multiple collaborators per item can be specified

 /.devops/*.Dockerfile                   @ngxson
 /.github/actions/                       @ggml-org/ci
@@ -23,12 +7,9 @@
 /ci/                                    @ggerganov
 /cmake/                                 @ggerganov
 /common/                                @ggml-org/llama-common
-/common/fit.*                           @JohannesGaessler
 /common/jinja/                          @CISC
 /common/ngram-map.*                     @srogmann
-/conversion/                            @CISC
 /convert_*.py                           @CISC
-/docs/backend/snapdragon/               @ggml-org/ggml-hexagon
 /examples/batched.swift/                @ggerganov
 /examples/batched/                      @ggerganov
 /examples/convert-llama2c-to-ggml/      @ggerganov
@@ -49,34 +30,33 @@
 /examples/parallel/                     @ggerganov
 /examples/passkey/                      @ggerganov
 /examples/retrieval/                    @ggerganov
+/examples/save-load-state/              @ggerganov
 /examples/speculative-simple/           @ggerganov
 /examples/speculative/                  @ggerganov
 /ggml/cmake/                            @ggerganov
 /ggml/include/                          @ggerganov
-/ggml/src/ggml-backend-meta.cpp         @JohannesGaessler
 /ggml/src/ggml-cann/                    @ggml-org/ggml-cann
 /ggml/src/ggml-common.h                 @ggerganov
 /ggml/src/ggml-cpu/                     @ggerganov
 /ggml/src/ggml-cpu/spacemit/            @alex-spacemit
 /ggml/src/ggml-cuda/                    @ggml-org/ggml-cuda
-/ggml/src/ggml-cuda/vendors/hip.h       @IMbackK
 /ggml/src/ggml-cuda/fattn-wmma*         @IMbackK
-/ggml/src/ggml-hexagon/                 @ggml-org/ggml-hexagon
 /ggml/src/ggml-hip/                     @IMbackK
+/ggml/src/ggml-cuda/vendors/hip.h       @IMbackK
 /ggml/src/ggml-impl.h                   @ggerganov
 /ggml/src/ggml-metal/                   @ggml-org/ggml-metal
 /ggml/src/ggml-opencl/                  @ggml-org/ggml-opencl
-/ggml/src/ggml-openvino/                @cavusmustafa @wine99
+/ggml/src/ggml-hexagon/                 @ggml-org/ggml-hexagon
 /ggml/src/ggml-opt.cpp                  @JohannesGaessler
 /ggml/src/ggml-quants.*                 @ggerganov
 /ggml/src/ggml-rpc/                     @ggml-org/ggml-rpc
 /ggml/src/ggml-sycl/                    @ggml-org/ggml-sycl
 /ggml/src/ggml-threading.*              @ggerganov
-/ggml/src/ggml-virtgpu/                 @kpouget
 /ggml/src/ggml-vulkan/                  @ggml-org/ggml-vulkan
+/ggml/src/ggml-virtgpu/                 @kpouget
 /ggml/src/ggml-webgpu/                  @ggml-org/ggml-webgpu
 /ggml/src/ggml-zdnn/                    @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
-/ggml/src/ggml-zendnn/                  @avinashcpandey @Jiten1parmar @z-vishal
+/ggml/src/ggml-openvino/                @cavusmustafa @wine99
 /ggml/src/ggml.c                        @ggerganov
 /ggml/src/ggml.cpp                      @ggerganov
 /ggml/src/gguf.cpp                      @JohannesGaessler @Green-Sky
@@ -85,7 +65,6 @@
 /scripts/gen*                           @ggerganov
 /scripts/get*                           @ggerganov
 /scripts/sync*                          @ggerganov
-/scripts/snapdragon/                    @ggml-org/ggml-hexagon
 /src/                                   @ggerganov
 /src/llama-adapter.*                    @CISC
 /src/llama-arch.*                       @CISC
@@ -107,7 +86,7 @@
 /tools/rpc/                             @ggml-org/ggml-rpc
 /tools/server/*                         @ggml-org/llama-server # no subdir
 /tools/server/tests/                    @ggml-org/llama-server
-/tools/ui/                              @ggml-org/llama-ui
+/tools/server/webui/                    @ggml-org/llama-webui
 /tools/tokenize/                        @ggerganov
 /tools/tts/                             @ggerganov
 /vendor/                                @ggerganov
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -11,8 +11,6 @@ The project differentiates between 3 levels of contributors:
 > [!IMPORTANT]
 > This project does **not** accept pull requests that are fully or predominantly AI-generated. AI tools may be utilized solely in an assistive capacity.
 >
-> Repeated violations of this policy may result in your account being permanently banned from contributing to the project.
->
 > Detailed information regarding permissible and restricted uses of AI can be found in the [AGENTS.md](AGENTS.md) file.

 Code that is initially generated by AI and subsequently edited will still be considered AI-generated. AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (e.g., generating repeated lines with minor variations).
@@ -46,9 +44,7 @@ Before submitting your PR:
    - provide KL divergence data calculated vs. the FP16/BF16 (whichever is the native precision) version for both the new type as well as types of similar size
    - provide [performance data](https://github.com/ggml-org/llama.cpp/tree/master/tools/llama-bench) for the new type in comparison to types of similar size on pure CPU
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If you are a new contributor
-    - Limit your open PRs to 1
-    - Do not submit trivial fixes (e.g. typos, formatting changes)
+- If you are a new contributor, limit your open PRs to 1.

 After submitting your PR:
 - Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability
@@ -65,10 +61,10 @@ After submitting your PR:
 - When merging a PR, make sure you have a good understanding of the changes
 - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)

-Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
+Maintainers reserve the right to decline review or close pull requests for any reason, particularly under any of the following conditions:
 - The proposed change is already mentioned in the roadmap or an existing issue, and it has been assigned to someone.
 - The pull request duplicates an existing one.
- The contributor fails to adhere to this contributing guide or the AI policy.
+- The contributor fails to adhere to this contributing guide.

 # Coding guidelines

@@ -182,8 +178,6 @@ Maintainers reserve the right to decline review or close pull requests for any r
 - New code should follow the guidelines (coding, naming, etc.) outlined in this document. Exceptions are allowed in isolated, backend-specific parts of the code that do not interface directly with the `ggml` interfaces.
  _(NOTE: for legacy reasons, existing code is not required to follow this guideline)_

- For changes in server, please make sure to refer to the [server development documentation](./tools/server/README-dev.md)
-
 # Documentation

 - Documentation is a community effort
--- a/README.md
+++ b/README.md
@@ -17,7 +17,6 @@ LLM inference in C/C++

 ## Hot topics

- **Hugging Face cache migration: models downloaded with `-hf` are now stored in the standard Hugging Face cache directory, enabling sharing with other HF tools.**
 - **[guide : using the new WebUI of llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/16938)**
 - [guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)
 - [[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)
@@ -27,7 +26,6 @@ LLM inference in C/C++
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
 - Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
- WebGPU support is now available in the browser, see a blog/demo introducing it [here](https://reeselevine.github.io/llamas-on-the-web/).

 ----

@@ -173,7 +171,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
 - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Ruby: [docusealco/rllama](https://github.com/docusealco/rllama)
 - Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
@@ -244,7 +241,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 <details>
 <summary>Tools</summary>

- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from Hugging Face Hub and convert them to GGML
+- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from Hugging Face Hub and convert them to GGML
 - [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
@@ -281,7 +278,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Metal](docs/build.md#metal-build) | Apple Silicon |
 | [BLAS](docs/build.md#blas-build) | All |
 | [BLIS](docs/backend/BLIS.md) | All |
-| [SYCL](docs/backend/SYCL.md) | Intel GPU |
+| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
 | [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs |
 | [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
@@ -291,7 +288,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
 | [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
-| [WebGPU](docs/build.md#webgpu) | All |
+| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
 | [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
 | [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
@@ -303,13 +300,13 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
 - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
 - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)

-You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:

 ```sh
 llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
 ```

-By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. The `MODEL_ENDPOINT` must point to a Hugging Face compatible API endpoint.
+By default, the CLI downloads from Hugging Face; you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to download model checkpoints from ModelScope or other model-sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.

 After downloading a model, use the CLI tools to run it locally - see below.

@@ -531,7 +528,6 @@ To learn more about model quantization, [read this documentation](tools/quantize
 - [How to build](docs/build.md)
 - [Running on Docker](docs/docker.md)
 - [Build on Android](docs/android.md)
- [Multi-GPU usage](docs/multi-gpu.md)
 - [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)

--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@@ -1,20 +0,0 @@
-set(TARGET llama-app)
-
-add_executable(${TARGET} llama.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
-
-target_link_libraries(${TARGET} PRIVATE
-    llama-server-impl
-    llama-cli-impl
-    llama-completion-impl
-    llama-bench-impl
-    llama-batched-bench-impl
-    llama-fit-params-impl
-    llama-quantize-impl
-    llama-perplexity-impl
-)
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-if(LLAMA_TOOLS_INSTALL)
-    install(TARGETS ${TARGET} RUNTIME)
-endif()
--- a/app/llama.cpp
+++ b/app/llama.cpp
@@ -1,95 +0,0 @@
-#include "build-info.h"
-
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include <vector>
-
-// visible
-int llama_server(int argc, char ** argv);
-int llama_cli(int argc, char ** argv);
-
-// hidden
-int llama_completion(int argc, char ** argv);
-int llama_bench(int argc, char ** argv);
-int llama_batched_bench(int argc, char ** argv);
-int llama_fit_params(int argc, char ** argv);
-int llama_quantize(int argc, char ** argv);
-int llama_perplexity(int argc, char ** argv);
-
-static int help(int argc, char ** argv);
-static int version(int argc, char ** argv);
-
-struct command {
-    const char * name;
-    const char * desc;
-    std::vector<std::string> aliases;
-    bool hidden;
-    int (*func)(int, char **);
-};
-
-static const command cmds[] = {
-    {"serve",         "HTTP API server",                                    {"server"},   false, llama_server       },
-    {"cli",           "Command-line interactive interface",                 {"client"},   false, llama_cli          },
-    {"completion",    "Text completion",                                    {"complete"}, true,  llama_completion   },
-    {"bench",         "Benchmark prompt processing and text generation",    {},           true,  llama_bench        },
-    {"batched-bench", "Benchmark batched decoding performance",             {},           true,  llama_batched_bench},
-    {"fit-params",    "Compute parameters to fit a model in device memory", {},           true,  llama_fit_params   },
-    {"quantize",      "Quantize a model",                                   {},           true,  llama_quantize     },
-    {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,  llama_perplexity   },
-    {"version",       "Show version",                                       {},           true,  version            },
-    {"help",          "Show available commands",                            {},           true,  help               },
-};
-
-static int version(int argc, char ** argv) {
-    printf("%s\n", llama_build_info());
-    return 0;
-}
-
-static int help(int argc, char ** argv) {
-    const bool show_all = argc >= 2 && std::string(argv[1]) == "all";
-
-    printf("Usage: llama <command> [options]\n\nAvailable commands:\n");
-
-    for (const auto & cmd : cmds) {
-        if (show_all || !cmd.hidden) {
-            printf("  %-15s %s\n", cmd.name, cmd.desc);
-        }
-    }
-    printf("\nRun 'llama <command> --help' for command-specific usage.\n");
-
-    return 0;
-}
-
-static bool matches(const std::string & arg, const command & cmd) {
-    if (arg == cmd.name) {
-        return true;
-    }
-    for (const auto & alias : cmd.aliases) {
-        if (arg == alias) {
-            return true;
-        }
-    }
-    return false;
-}
-
-int main(int argc, char ** argv) {
-    const std::string arg = argc >= 2 ? argv[1] : "help";
-
-    for (const auto & cmd : cmds) {
-        if (matches(arg, cmd)) {
-
-            // router spawns children through this same binary, it needs the
-            // subcommand to relaunch as 'llama serve' and not bare options
-#ifdef _WIN32
-            _putenv_s("LLAMA_APP_CMD", cmd.name);
-#else
-            setenv("LLAMA_APP_CMD", cmd.name, 1);
-#endif
-            return cmd.func(argc - 1, argv + 1);
-        }
-    }
-
-    fprintf(stderr, "error: unknown command '%s'\n", arg.c_str());
-    return 1;
-}
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -7,7 +7,6 @@ VISIONOS_MIN_OS_VERSION=1.0
 TVOS_MIN_OS_VERSION=16.4

 BUILD_SHARED_LIBS=OFF
-LLAMA_BUILD_APP=OFF
 LLAMA_BUILD_EXAMPLES=OFF
 LLAMA_BUILD_TOOLS=OFF
 LLAMA_BUILD_TESTS=OFF
@@ -32,7 +31,6 @@ COMMON_CMAKE_ARGS=(
    -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
    -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
    -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
-    -DLLAMA_BUILD_APP=${LLAMA_BUILD_APP}
    -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
    -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
    -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -25,13 +25,7 @@
 # # with KLEIDIAI support
 # GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
-# # with BLAS support
-# GG_BUILD_BLAS=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-#
-# with BLAS support (custom vendor)
-# GG_BUILD_BLAS=1 GG_BUILD_BLAS_VENDOR=Intel10_64lp bash ./ci/run.sh ./tmp/results ./tmp/mnt
-#
-# with OPENVINO support
+# # with OPENVINO support
 # GG_BUILD_OPENVINO=1 GG_BUILD_LOW_PERF=1 GGML_OPENVINO_DEVICE=CPU bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #

@@ -57,13 +51,6 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=${LLAMA_FATAL_WARNINGS:-ON} -DLLAMA_OPENSSL=OFF -DGGML_SCHED_NO_REALLOC=ON"
 CTEST_EXTRA=""

-# Default to use make unless specified for compatibility
-CMAKE_GENERATOR="Unix Makefiles"
-
-if [ ! -z "${GG_BUILD_NINJA}" ]; then
-    CMAKE_GENERATOR="Ninja"
-fi
-
 if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
 fi
@@ -117,19 +104,8 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
    # if on Mac, disable METAL
    if [[ "$OSTYPE" == "darwin"* ]]; then
        CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
-
-        MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION="/usr/local/lib/cmake/vulkan"
-        MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION="${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers/SPIRV-HeadersConfig.cmake"
-        if [[ -f "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" || -h "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" ]]; then
-            CMAKE_EXTRA="${CMAKE_EXTRA} -DSPIRV-Headers_DIR=${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers"
-        fi
    fi

-    # Build shared libs on Windows
-    # to reduce binary size and avoid errors in library loading unit tests
-    if uname -s | grep -qi nt; then
-        CMAKE_EXTRA="${CMAKE_EXTRA} -DBUILD_SHARED_LIBS=ON"
-    fi
 fi

 if [ ! -z ${GG_BUILD_WEBGPU} ]; then
@@ -162,11 +138,35 @@ fi

 if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
    echo ">>===== Enabling KleidiAI support"
-    CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } -DGGML_CPU_KLEIDIAI=ON"
-fi

-if [ ! -z ${GG_BUILD_BLAS} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=${GG_BUILD_BLAS_VENDOR:-OpenBLAS}"
+    CANDIDATES=(
+        "armv9-a+dotprod+i8mm+sve2"
+        "armv9-a+dotprod+i8mm"
+        "armv8.6-a+dotprod+i8mm"
+        "armv8.2-a+dotprod"
+    )
+    CPU=""
+
+    for cpu in "${CANDIDATES[@]}"; do
+        if echo 'int main(){}' | ${CXX:-c++} -march="$cpu" -x c++ - -c -o /dev/null >/dev/null 2>&1; then
+            CPU="$cpu"
+            break
+        fi
+    done
+
+    if [ -z "$CPU" ]; then
+        echo "ERROR: None of the required ARM baselines (armv9/armv8.6/armv8.2 + dotprod) are supported by this compiler."
+        exit 1
+    fi
+
+    echo ">>===== Using ARM baseline: ${CPU}"
+
+    CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } \
+        -DGGML_NATIVE=OFF \
+        -DGGML_CPU_KLEIDIAI=ON \
+        -DGGML_CPU_AARCH64=ON \
+        -DGGML_CPU_ARM_ARCH=${CPU} \
+        -DBUILD_SHARED_LIBS=OFF"
 fi

 if [ ! -z ${GG_BUILD_OPENVINO} ]; then
@@ -232,13 +232,13 @@ function gg_run_ctest_debug {

    set -e

-    # Check required binaries are installed
+    # Check cmake, make and ctest are installed
    gg_check_build_requirements

-    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                  ) 2>&1 | tee -a $OUT/${ci}-make.log

-    (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops|test-llama-archs" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
 }
@@ -263,16 +263,16 @@ function gg_run_ctest_release {

    set -e

-    # Check required binaries are installed
+    # Check cmake, make and ctest are installed
    gg_check_build_requirements

-    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    if [ -z ${GG_BUILD_LOW_PERF} ]; then
-        (time ctest -C Release --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest --output-on-failure -L 'main|python' ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
    else
-        (time ctest -C Release --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest --output-on-failure -L main -E test-opt ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
    fi

    set +e
@@ -330,7 +330,7 @@ function gg_run_ctest_with_model_debug {
    cd build-ci-debug
    set -e

-    (LLAMACPP_TEST_MODELFILE="$model" time ctest -C Debug --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
    cd ..
@@ -343,7 +343,7 @@ function gg_run_ctest_with_model_release {
    cd build-ci-release
    set -e

-    (LLAMACPP_TEST_MODELFILE="$model" time ctest -C Release --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log

    # test memory leaks
    #if [[ ! -z ${GG_BUILD_METAL} ]]; then
@@ -397,8 +397,8 @@ function gg_run_qwen3_0_6b {

    set -e

-    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf  --outtype f16
    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16
@@ -461,10 +461,10 @@ function gg_run_qwen3_0_6b {

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on  --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off                ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on                 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on  --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off                ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on                 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -546,8 +546,8 @@ function gg_run_embd_bge_small {

    set -e

-    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@@ -591,8 +591,8 @@ function gg_run_rerank_tiny {

    set -e

-    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time cmake --build . --config Release -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

@@ -638,36 +638,12 @@ function gg_sum_rerank_tiny {
 }

 function gg_check_build_requirements {
-    if ! command -v git &> /dev/null; then
-        gg_printf 'git not found, please install'
-    fi
-
-    if ! command -v git-lfs &> /dev/null; then
-        gg_printf 'git-lfs not found, please install'
-    fi
-
-    if ! command -v wget &> /dev/null; then
-        gg_printf 'wget not found, please install'
-    fi
-
-    if ! command -v python3 &> /dev/null; then
-        gg_printf 'python3 not found, please install'
-    fi
-
-    if ! command -v pip3 &> /dev/null; then
-        gg_printf 'pip3 not found, please install'
-    fi
-
-    if ! python3 -m ensurepip --help &> /dev/null; then
-        gg_printf 'ensurepip not found, please install python3-venv package'
-    fi
-
    if ! command -v cmake &> /dev/null; then
        gg_printf 'cmake not found, please install'
    fi

-    if ! command -v ccache &> /dev/null; then
-        gg_printf 'ccache not found, please consider installing for faster builds'
+    if ! command -v make &> /dev/null; then
+        gg_printf 'make not found, please install'
    fi

    if ! command -v ctest &> /dev/null; then
--- a/cmake/arm64-linux-clang.cmake
+++ b/cmake/arm64-linux-clang.cmake
@@ -1,17 +0,0 @@
-set( CMAKE_SYSTEM_NAME Linux )
-set( CMAKE_SYSTEM_PROCESSOR arm64 )
-
-set( target aarch64-linux-gnu )
-
-set( CMAKE_C_COMPILER    clang )
-set( CMAKE_CXX_COMPILER  clang++ )
-
-set( CMAKE_C_COMPILER_TARGET   ${target} )
-set( CMAKE_CXX_COMPILER_TARGET ${target} )
-
-set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
-set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
-
-set( CMAKE_C_FLAGS_INIT   "${arch_c_flags} ${warn_c_flags}" )
-set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
-
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@@ -7,7 +7,7 @@ set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)

 set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
 set_and_check(LLAMA_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
-set(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
+set_and_check(LLAMA_BIN_DIR     "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")

 find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)

--- a/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
+++ b/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
@@ -24,6 +24,6 @@ set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
 set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
 set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
-set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_C_FLAGS}")
-set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zvfh_zba_zicbop -mabi=lp64d -fno-tree-vectorize -fno-tree-loop-vectorize ${CMAKE_CXX_FLAGS}")
+set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_CXX_FLAGS}")
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,11 +1,9 @@
+# common
+
 find_package(Threads REQUIRED)

 llama_add_compile_flags()

-#
-# llama-common-base
-#
-
 # Build info header

 if(EXISTS "${PROJECT_SOURCE_DIR}/.git")
@@ -35,25 +33,17 @@ endif()

 set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in")
 set(OUTPUT_FILE   "${CMAKE_CURRENT_BINARY_DIR}/build-info.cpp")
-
 configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})

-set(TARGET llama-common-base)
-add_library(${TARGET} STATIC ${OUTPUT_FILE})
-
-target_include_directories(${TARGET} PUBLIC .)
-
+set(TARGET build_info)
+add_library(${TARGET} OBJECT ${OUTPUT_FILE})
 if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()

-#
-# llama-common
-#
+set(TARGET common)

-set(TARGET llama-common)
-
-add_library(${TARGET}
+add_library(${TARGET} STATIC
    arg.cpp
    arg.h
    base64.hpp
@@ -73,10 +63,6 @@ add_library(${TARGET}
    debug.h
    download.cpp
    download.h
-    fit.cpp
-    fit.h
-    hf-cache.cpp
-    hf-cache.h
    http.h
    json-partial.cpp
    json-partial.h
@@ -118,24 +104,17 @@ add_library(${TARGET}
    jinja/caps.h
    )

-set_target_properties(${TARGET} PROPERTIES
-    VERSION ${LLAMA_INSTALL_VERSION}
-    SOVERSION 0
-    MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number
-)
-
 target_include_directories(${TARGET} PUBLIC . ../vendor)
 target_compile_features   (${TARGET} PUBLIC cxx_std_17)

 if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
-    # TODO: make fine-grained exports in the future
-    set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 endif()

-target_link_libraries(${TARGET} PUBLIC  llama-common-base)
-target_link_libraries(${TARGET} PRIVATE cpp-httplib)
+target_link_libraries(${TARGET} PRIVATE
+    build_info
+    cpp-httplib
+)

 if (LLAMA_LLGUIDANCE)
    include(ExternalProject)
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -25,8 +25,7 @@ struct common_arg {
    const char * value_hint_2 = nullptr; // for second arg value
    const char * env          = nullptr;
    std::string help;
-    bool is_sampling = false; // is current arg a sampling param?
-    bool is_spec = false; // is current arg a speculative decoding param?
+    bool is_sparam = false; // is current arg a sampling param?
    bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
    void (*handler_void)   (common_params & params) = nullptr;
    void (*handler_string) (common_params & params, const std::string &) = nullptr;
@@ -75,8 +74,7 @@ struct common_arg {
    common_arg & set_examples(std::initializer_list<enum llama_example> examples);
    common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
    common_arg & set_env(const char * env);
-    common_arg & set_sampling();
-    common_arg & set_spec();
+    common_arg & set_sparam();
    common_arg & set_preset_only();
    bool in_example(enum llama_example ex);
    bool is_exclude(enum llama_example ex);
@@ -129,8 +127,5 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
 // see: https://github.com/ggml-org/llama.cpp/issues/18163
 void common_params_add_preset_options(std::vector<common_arg> & args);

-// Populate model paths (main model, mmproj, etc) from -hf if necessary
-void common_params_handle_models(common_params & params, llama_example curr_ex);
-
 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
--- a/common/build-info.cpp.in
+++ b/common/build-info.cpp.in
@@ -1,35 +1,4 @@
-#include "build-info.h"
-
-#include <cstdio>
-#include <string>
-
 int LLAMA_BUILD_NUMBER = @LLAMA_BUILD_NUMBER@;
-char const * LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
-char const * LLAMA_COMPILER = "@BUILD_COMPILER@";
-char const * LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
-
-int llama_build_number(void) {
-    return LLAMA_BUILD_NUMBER;
-}
-
-const char * llama_commit(void) {
-    return LLAMA_COMMIT;
-}
-
-const char * llama_compiler(void) {
-    return LLAMA_COMPILER;
-}
-
-const char * llama_build_target(void) {
-    return LLAMA_BUILD_TARGET;
-}
-
-const char * llama_build_info(void) {
-    static std::string s = "b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT;
-    return s.c_str();
-}
-
-void llama_print_build_info(void) {
-    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, llama_build_number(), llama_commit());
-    fprintf(stderr, "%s: built with %s for %s\n", __func__, llama_compiler(), llama_build_target());
-}
+char const *LLAMA_COMMIT = "@LLAMA_BUILD_COMMIT@";
+char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
+char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
--- a/common/build-info.h
+++ b/common/build-info.h
@@ -1,11 +0,0 @@
-#pragma once
-
-int llama_build_number(void);
-
-const char * llama_commit(void);
-const char * llama_compiler(void);
-
-const char * llama_build_target(void);
-const char * llama_build_info(void);
-
-void llama_print_build_info(void);
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -1,4 +1,3 @@
-#include "chat-auto-parser-helpers.h"
 #include "chat-auto-parser.h"
 #include "chat-peg-parser.h"
 #include "chat.h"
@@ -6,7 +5,6 @@
 #include "json-schema-to-grammar.h"
 #include "log.h"
 #include "nlohmann/json.hpp"
-#include "peg-parser.h"

 #include <stdexcept>
 #include <string>
@@ -25,13 +23,13 @@ static void foreach_function(const json & tools, const std::function<void(const

 namespace autoparser {

-parser_build_context::parser_build_context(common_chat_peg_builder & p, const generation_params & inputs) :
+parser_build_context::parser_build_context(common_chat_peg_builder & p, const templates_params & inputs) :
    p(p),
    inputs(inputs),
    reasoning_parser(p.eps()) {}

 common_chat_params peg_generator::generate_parser(const common_chat_template &    tmpl,
-                                                  const struct generation_params & inputs) {
+                                                  const struct templates_params & inputs) {
    // Run differential analysis to extract template structure
    struct autoparser autoparser;
    autoparser.analyze_template(tmpl);
@@ -39,38 +37,17 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
 }

 common_chat_params peg_generator::generate_parser(const common_chat_template &    tmpl,
-                                                  const struct generation_params & inputs,
+                                                  const struct templates_params & inputs,
                                                  const autoparser &              autoparser) {
+    // Build the parser using the analysis results
+    auto parser = autoparser.build_parser(inputs);
+
    // Create the result structure
    common_chat_params data;
-    data.prompt            = common_chat_template_direct_apply(tmpl, inputs);
-    data.generation_prompt = common_chat_template_generation_prompt(tmpl, inputs);
-    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.preserved_tokens  = autoparser.preserved_tokens;
-
-    std::string parser_generation_prompt = data.generation_prompt;
-
-    if (inputs.continue_final_message != COMMON_CHAT_CONTINUATION_NONE && !inputs.continue_msg.empty()) {
-        // Build up generation prompt manually
-        const auto & msg = inputs.continue_msg;
-
-        if (!autoparser.reasoning.start.empty()) {
-            data.generation_prompt = data.generation_prompt.substr(0, data.generation_prompt.find(autoparser.reasoning.start));
-            data.generation_prompt += autoparser.reasoning.start + msg.reasoning_content;
-            if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-                data.generation_prompt += autoparser.reasoning.end;
-            }
-        }
-
-        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
-            data.generation_prompt += msg.render_content();
-        }
-
-        data.prompt += data.generation_prompt;
-    }
-
-    auto parser = autoparser.build_parser(inputs, parser_generation_prompt);
-    data.parser = parser.save();
+    data.prompt           = common_chat_template_direct_apply(tmpl, inputs);
+    data.format           = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.preserved_tokens = autoparser.preserved_tokens;
+    data.parser           = parser.save();

    // Build grammar if tools are present
    bool has_tools =
@@ -88,13 +65,9 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
-                auto         schema   = function.contains("parameters") ? function.at("parameters") : json::object();
+                auto         schema   = function.at("parameters");
                builder.resolve_refs(schema);
            });
-            if (has_response_format) {
-                auto schema = inputs.json_schema;
-                builder.resolve_refs(schema);
-            }
            parser.build_grammar(builder, data.grammar_lazy);
        });

@@ -109,41 +82,44 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
    return data;
 }

-common_peg_arena autoparser::build_parser(const generation_params & inputs, const std::string & generation_prompt) const {
+common_peg_arena autoparser::build_parser(const templates_params & inputs) const {
    if (!analysis_complete) {
        throw std::invalid_argument("Cannot call build_parser on autoparser without performing analysis first, call analyze_template(...)");
    }
    return build_chat_peg_parser([&](common_chat_peg_builder & p) {
+        // If the template uses Python dict format (single-quoted strings in JSON structures),
+        // pre-register a json-string rule that accepts both quote styles. This must happen
+        // before any call to p.json() so that all JSON parsing inherits the flexible rule.
+        if (tools.format.uses_python_dicts) {
+            p.rule("json-string", p.quoted_string());
+        }
+
        parser_build_context ctx(p, inputs);
        bool                 extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
+        bool                 enable_thinking   = inputs.enable_thinking;

-        ctx.extracting_reasoning = extract_reasoning && reasoning.mode != reasoning_mode::NONE;
+        ctx.extracting_reasoning = extract_reasoning && enable_thinking && reasoning.mode != reasoning_mode::NONE;
        ctx.content              = &content;
-        ctx.reasoning            = &reasoning;

        // Build reasoning parser
        ctx.reasoning_parser = reasoning.build_parser(ctx);

-        auto parser = p.eps();
-
        bool has_tools           = inputs.tools.is_array() && !inputs.tools.empty();
        bool has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();
-        bool pure_content        = reasoning.mode == reasoning_mode::NONE;

        if (has_response_format) {
            auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
-            parser = ctx.reasoning_parser + p.space() + p.choice({
+            return ctx.reasoning_parser + p.space() + p.choice({
                p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
                response_format
            }) + p.end();
-            pure_content = false;
-        } else if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
-            parser = tools.build_parser(ctx);
-            pure_content = false;
-        } else {
-            parser = content.build_parser(ctx);
        }
-        return pure_content ? p.prefix(generation_prompt, reasoning.start) + parser : p.prefix(generation_prompt, reasoning.start) << parser;
+
+        if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
+            return tools.build_parser(ctx);
+        }
+
+        return content.build_parser(ctx);
    });
 }

@@ -154,15 +130,24 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
        return p.eps();
    }

+    bool thinking_forced_open   = (mode == reasoning_mode::FORCED_OPEN);
+    bool thinking_forced_closed = (mode == reasoning_mode::FORCED_CLOSED);
+
+    if (thinking_forced_open || thinking_forced_closed) {
+        // Thinking is forced open, or forced closed with enable_thinking=true.
+        // In both cases, expect only the closing tag (the opening tag was in the template).
+        // However, since we might have incorrectly detected the open/close pattern,
+        // we also accept an optional starting marker.
+        return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end;
+    }
    if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
-        if (!end.empty()) {
-            if (!start.empty()) {
-                // Standard tag-based: optional(<think>reasoning</think>)
-                return p.optional(p.optspace(start) + p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
-            }
-            // Delimiter-style (empty start)
-            return p.optional(p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
+        // Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
+        // Both use the same tag-based pattern if markers are available
+        if (!start.empty() && !end.empty()) {
+            return p.optional(start + p.reasoning(p.until(end)) + end);
        }
+    } else if (mode == reasoning_mode::DELIMITER) {
+        return p.optional(p.reasoning(p.until(end)) + end);
    }

    return p.eps();
@@ -208,6 +193,7 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
 common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
+    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    // Build effective field names with dot notation if function_field is set
    std::string name_field = format.name_field;
@@ -219,19 +205,10 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
        args_field = format.function_field + "." + args_field;
    }

-    auto tools_parser = p.eps();
-    if (format.section_start.empty() && !format.per_call_start.empty()) {
-        auto single_tool_parser = p.standard_json_tools(
-            format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls,
-            inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
-            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
-        tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space()));
-    } else {
-        tools_parser = p.standard_json_tools(
-            format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
-            inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
-            format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);
-    }
+    auto tools_parser = p.standard_json_tools(
+        format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls,
+        inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped,
+        format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order);

    // Handle content wrappers if present
    if (ctx.content && ctx.content->is_always_wrapped()) {
@@ -246,80 +223,34 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
        tool_start = format.per_call_start;
    }

-    return ctx.reasoning_parser + p.optional(p.content(p.until(tool_start))) + tools_parser + p.end();
-}
-
-common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name,
-                                                    const common_peg_parser & call_id_section, bool have_call_id,
-                                                    const common_peg_parser & args,
-                                                    std::optional<common_peg_parser> atomic_peek) const {
-    auto              open           = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix);
-    bool              matched_atomic = false;
-    common_peg_parser func_parser    = p.eps();
-
-    if (!function.name_suffix.empty()) {
-        func_parser    = open + call_id_section + p.space() + args;
-        matched_atomic = true;
-    } else if (have_call_id) {
-        func_parser    = p.atomic(open + call_id_section) + p.space() + args;
-        matched_atomic = true;
-    } else if (atomic_peek.has_value()) {
-        func_parser    = p.atomic(open + call_id_section + p.space() + *atomic_peek) + args;
-        matched_atomic = true;
-    } else {
-        func_parser = open + call_id_section + p.space() + args;
-    }
-
-    if (!function.close.empty()) {
-        func_parser = func_parser + p.space() + p.tool_close(p.literal(function.close));
-    } else if (!format.per_call_end.empty()) {
-        // When there's no func_close but there is a per_call_end marker, use peek() to ensure
-        // we only emit tool_close when we can actually see the closing marker. This prevents
-        // premature closing during partial parsing when we've seen e.g. "</" which could be
-        // either "</tool_call>" (end) or "<arg_key>" prefix that failed to match.
-        func_parser = func_parser + p.tool_close(p.peek(p.literal(format.per_call_end)));
-    } else {
-        func_parser = func_parser + p.tool_close(p.space());  // force this to process tool closing callbacks in mapper
-    }
-    if (!matched_atomic) {
-        func_parser = p.atomic(func_parser);
-    }
-    return func_parser;
+    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(p.until(tool_start)))) + tools_parser +
+           p.end();
 }

 common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
+    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    common_peg_parser tool_choice = p.choice();

    foreach_function(inputs.tools, [&](const json & tool) {
        const auto & func   = tool.at("function");
        std::string  name   = func.at("name");
-        const auto & schema = func.contains("parameters") ? func.at("parameters") : json::object();
+        const auto & schema = func.at("parameters");

        // Build call_id parser based on position (if supported)
-        bool have_call_id = false;
        common_peg_parser call_id_section = p.eps();
        if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
-            (!call_id.suffix.empty() || !arguments.start.empty())) {
-            if (!call_id.suffix.empty()) {
-                call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix))) + call_id.suffix;
-            } else {
-                call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(arguments.start)));
-            }
-            have_call_id = true;
-        }
-        auto args_parser = p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema));
-        if (!arguments.start.empty()) {
-            args_parser = p.literal(arguments.start) + args_parser;
-        }
-        if (!arguments.end.empty()) {
-            args_parser = args_parser + p.literal(arguments.end);
+            !call_id.suffix.empty()) {
+            call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix))) + call_id.suffix;
        }

-        auto atomic_peek = !arguments.start.empty() ? std::optional(p.peek(p.literal(arguments.start))) : std::nullopt;
-        auto func_parser = build_func_parser(p, name, call_id_section, have_call_id, args_parser, atomic_peek);
+        auto func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
+                           call_id_section + p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", schema));
+        if (!function.close.empty()) {
+            func_parser = func_parser + function.close;
+        }
        tool_choice |= p.rule("tool-" + name, func_parser);
    });

@@ -355,47 +286,58 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context

    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
+    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
+           p.end();
 }

 common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_context & ctx) const {
    auto &       p           = ctx.p;
    const auto & inputs      = ctx.inputs;
-
-    auto until_suffix = p.rule("until-suffix", p.until(arguments.value_suffix));
+    bool         force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;

    common_peg_parser tool_choice = p.choice();

    foreach_function(inputs.tools, [&](const json & tool) {
-        const auto &          func       = tool.at("function");
-        std::string           name       = func.at("name");
-        auto                  params     = func.contains("parameters") ? func.at("parameters") : json::object();
-        const auto &          properties = params.contains("properties") ? params.at("properties") : json::object();
+        const auto & func   = tool.at("function");
+        std::string  name   = func.at("name");
+        const auto & params = func.at("parameters");

-        std::set<std::string> required;
-        if (params.contains("required")) {
-            params.at("required").get_to(required);
+        if (!params.contains("properties") || !params.at("properties").is_object()) {
+            return;
        }

-        auto schema_info = common_schema_info();
-        schema_info.resolve_refs(params);
+        const auto &          properties = params.at("properties");
+        std::set<std::string> required;
+        if (params.contains("required") && params.at("required").is_array()) {
+            params.at("required").get_to(required);
+        }

        // Build parser for each argument, separating required and optional
        std::vector<common_peg_parser> required_parsers;
        std::vector<common_peg_parser> optional_parsers;
        for (const auto & [param_name, param_schema] : properties.items()) {
-            bool is_required = required.find(param_name) != required.end();
+            bool        is_required = required.find(param_name) != required.end();
+            std::string type        = "object";
+            auto        type_obj    = param_schema.contains("type") ? param_schema.at("type") : json::object();
+            if (type_obj.is_string()) {
+                type_obj.get_to(type);
+            } else if (type_obj.is_object()) {
+                if (type_obj.contains("type") && type_obj.at("type").is_string()) {
+                    type_obj.at("type").get_to(type);
+                }
+            }

-            auto arg =
-                p.tool_arg(p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
-                                           arguments.name_suffix) +
-                           arguments.value_prefix +
-                           (schema_info.resolves_to_string(param_schema) ?
-                                p.tool_arg_string_value(until_suffix) :
-                                p.tool_arg_json_value(p.schema(
-                                    p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
-                                    p.space()) +
-                           p.tool_arg_close(p.literal(arguments.value_suffix)));
+            auto arg = p.tool_arg(
+                p.tool_arg_open(arguments.name_prefix + p.tool_arg_name(p.literal(param_name)) +
+                                arguments.name_suffix) +
+                arguments.value_prefix +
+                (type == "string" ? p.tool_arg_string_value(p.schema(p.until(arguments.value_suffix),
+                                                                     "tool-" + name + "-arg-" + param_name + "-schema",
+                                                                     param_schema, true)) :
+                                    p.tool_arg_json_value(p.schema(
+                                        p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, format.uses_python_dicts)) +
+                                        p.space()) +
+                p.tool_arg_close(p.literal(arguments.value_suffix)));

            auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg);
            if (is_required) {
@@ -420,34 +362,53 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
            for (const auto & opt : optional_parsers) {
                any_opt |= opt;
            }
-            args_seq = args_seq + p.repeat(p.space() + any_opt, 0, -1);
-        }
-
-        if (!arguments.start.empty()) {
-            args_seq = p.literal(arguments.start) + args_seq;
-        }
-        if (!arguments.end.empty()) {
-            args_seq = args_seq + p.literal(arguments.end);
+            args_seq = args_seq + p.repeat(p.space() + any_opt, 0, (int) optional_parsers.size());
        }

        // Build call_id parser based on position (if supported)
        common_peg_parser call_id_section = p.eps();
        bool have_call_id = false;
        if (call_id.pos == call_id_position::BETWEEN_FUNC_AND_ARGS && !call_id.prefix.empty() &&
-            (!call_id.suffix.empty() || !arguments.start.empty())) {
+            !call_id.suffix.empty()) {
            have_call_id = true;
-            if (!call_id.suffix.empty()) {
-                call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix)) + call_id.suffix);
-            } else {
-                call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(arguments.start)));
-            }
+            call_id_section = p.optional(call_id.prefix + p.tool_id(p.until(call_id.suffix)) + call_id.suffix);
+        }
+
+        bool matched_atomic = false;
+        common_peg_parser func_parser = p.eps();
+        if (!function.name_suffix.empty()) {
+            func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
+                call_id_section + p.space() + args_seq;
+            matched_atomic = true;
+        } else if (have_call_id) {
+            func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
+                call_id_section) + p.space() + args_seq;
+            matched_atomic = true;
+        } else if (!arguments.name_prefix.empty() && properties.size() > 0) {
+            func_parser = p.atomic(p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
+                call_id_section + p.space() + p.peek(p.literal(arguments.name_prefix))) + args_seq;
+            matched_atomic = true;
+        } else {
+            func_parser = p.tool_open(function.name_prefix + p.tool_name(p.literal(name)) + function.name_suffix) +
+                call_id_section + p.space() + args_seq;
+        }
+
+        if (!function.close.empty()) {
+            func_parser = func_parser + p.space() + p.tool_close(p.literal(function.close));
+        } else if (!format.per_call_end.empty()) {
+            // When there's no func_close but there is a per_call_end marker, use peek() to ensure
+            // we only emit tool_close when we can actually see the closing marker. This prevents
+            // premature closing during partial parsing when we've seen e.g. "</" which could be
+            // either "</tool_call>" (end) or "<arg_key>" prefix that failed to match.
+            func_parser = func_parser + p.tool_close(p.peek(p.literal(format.per_call_end)));
+        } else {
+            func_parser =
+                func_parser + p.tool_close(p.space());  // force this to process tool closing callbacks in mapper
+        }
+        if (!matched_atomic) {
+            func_parser = p.atomic(func_parser);
        }

-        // Only peek for an arg tag when there are required args that must follow.
-        // When all args are optional, the model may emit no arg tags at all (#20650).
-        auto atomic_peek = (!arguments.name_prefix.empty() && !required_parsers.empty()) ?
-            std::optional(p.peek(p.literal(arguments.name_prefix))) : std::nullopt;
-        auto func_parser = build_func_parser(p, name, call_id_section, have_call_id, args_seq, atomic_peek);
        tool_choice |= p.rule("tool-" + name, func_parser);
    });

@@ -458,14 +419,14 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
    if (!format.per_call_start.empty()) {
        auto wrapped_call = format.per_call_start + p.space() + tool_choice + p.space() + format.per_call_end;
        if (inputs.parallel_tool_calls) {
-            tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call) + p.space());
+            tool_calls = p.trigger_rule("tool-call", wrapped_call + p.zero_or_more(p.space() + wrapped_call));
        } else {
-            tool_calls = p.trigger_rule("tool-call", wrapped_call + p.space());
+            tool_calls = p.trigger_rule("tool-call", wrapped_call);
        }
        if (!format.section_start.empty()) {
            tool_calls = p.trigger_rule("tool-calls",
                                        p.literal(format.section_start) + p.space() + tool_calls + p.space() +
-                                            (format.section_end.empty() ? p.end() : p.literal(format.section_end) + p.space()));
+                                            (format.section_end.empty() ? p.end() : p.literal(format.section_end)));
        }
    } else {
        std::string separator = ", ";  // Default
@@ -486,7 +447,8 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte

    std::string trigger_marker       = !format.section_start.empty() ? format.section_start : format.per_call_start;
    auto        content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
-    return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
+    return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
+           p.end();
 }

 }  // namespace autoparser
--- a/common/chat-auto-parser-helpers.cpp
+++ b/common/chat-auto-parser-helpers.cpp
@@ -1,11 +1,9 @@
 #include "chat-auto-parser-helpers.h"

 #include "chat-auto-parser.h"
-#include "chat-peg-parser.h"
 #include "chat.h"
 #include "log.h"
 #include "nlohmann/json.hpp"
-#include "peg-parser.h"

 #include <cctype>
 #include <numeric>
@@ -188,21 +186,6 @@ diff_split calculate_diff_split(const std::string & left, const std::string & ri
        result.suffix = "";
        // pick prefix = all as representation
    }
-
-    // When left has no unique content (result.left is empty), left is entirely
-    // shared with right. The simultaneous prefix/suffix segment matching can
-    // incorrectly consume trailing segments of left as suffix when those same
-    // segments also appear at the end of right (e.g. "\n" at the end of both
-    // the shared content and the generation prompt). This rotates the diff.
-    // Fix: if left is a prefix of right, enforce that directly.
-    if (result.left.empty() && !result.right.empty() &&
-            left.size() <= right.size() &&
-            right.substr(0, left.size()) == left) {
-        result.prefix = left;
-        result.suffix = "";
-        result.right  = right.substr(left.size());
-    }
-
    return result;
 }

@@ -310,10 +293,8 @@ std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segm

 namespace autoparser {

-static const std::string ERR_TMPL = "#**ERROR**#";
-
 std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
-    generation_params tmpl_params;
+    templates_params tmpl_params;
    tmpl_params.messages              = params.messages;
    tmpl_params.tools                 = params.tools;
    tmpl_params.add_generation_prompt = params.add_generation_prompt;
@@ -328,7 +309,7 @@ std::string apply_template(const common_chat_template & tmpl, const template_par
        return common_chat_template_direct_apply(tmpl, tmpl_params);
    } catch (const std::exception & e) {
        LOG_DBG("Template application failed: %s\n", e.what());
-        return ERR_TMPL;
+        return "";
    }
 }

@@ -349,7 +330,7 @@ std::optional<compare_variants_result> compare_variants(
    std::string output_B = apply_template(tmpl, params_B);

    // Check for template application failures
-    if (output_A == ERR_TMPL || output_B == ERR_TMPL) {
+    if (output_A.empty() || output_B.empty()) {
        return std::nullopt;
    }

--- a/common/chat-auto-parser-helpers.h
+++ b/common/chat-auto-parser-helpers.h
@@ -1,7 +1,6 @@
 #pragma once

 #include "chat-auto-parser.h"
-
 #include <functional>
 #include <optional>
 #include <string>
--- a/common/chat-auto-parser.h
+++ b/common/chat-auto-parser.h
@@ -4,7 +4,6 @@
 #include "common.h"
 #include "jinja/caps.h"
 #include "peg-parser.h"
-#include "nlohmann/json.hpp"

 #include <chrono>
 #include <optional>
@@ -51,7 +50,7 @@ namespace autoparser {
 // High-level params for parser generation
 // ============================================================================

-struct generation_params {
+struct templates_params {
    json                                  messages;
    json                                  tools;
    common_chat_tool_choice               tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
@@ -60,21 +59,15 @@ struct generation_params {
    common_reasoning_format               reasoning_format    = COMMON_REASONING_FORMAT_AUTO;
    bool                                  stream              = true;
    std::string                           grammar;
-    bool                                  add_generation_prompt  = false;
-    common_chat_continuation              continue_final_message = COMMON_CHAT_CONTINUATION_NONE;
-    common_chat_msg                       continue_msg;
-    bool                                  enable_thinking        = true;
-    std::chrono::system_clock::time_point now                    = std::chrono::system_clock::now();
+    bool                                  add_generation_prompt = false;
+    bool                                  enable_thinking       = true;
+    std::chrono::system_clock::time_point now                   = std::chrono::system_clock::now();
    json                                  extra_context;
    bool                                  add_bos       = false;
    bool                                  add_eos       = false;
    bool                                  is_inference  = true;
    bool                                  add_inference = false;
    bool                                  mark_input    = true;  // whether to mark input strings in the jinja context
-
-    bool has_continuation() const {
-        return continue_final_message != COMMON_CHAT_CONTINUATION_NONE && !continue_msg.empty();
-    }
 };

 // ============================================================================
@@ -84,7 +77,11 @@ struct generation_params {
 // Reasoning handling mode (derived from R1-R3 comparisons)
 enum class reasoning_mode {
    NONE,           // No reasoning markers detected
-    TAG_BASED,      // Tag-based: <think>...</think> (start can be empty for delimiter-style)
+    TAG_BASED,      // Standard tag-based: <think>...</think>
+    DELIMITER,      // Delimiter-based: [BEGIN FINAL RESPONSE] (reasoning ends at delimiter)
+    FORCED_OPEN,    // Template ends with open reasoning tag (empty start, non-empty end)
+    FORCED_CLOSED,  // Template ends with an open reasoning tag when thinking is enabled, but
+                    // with both the opening and closing tags when thinking is disabled
    TOOLS_ONLY      // Only reason on tool calls, not on normal content
 };

@@ -94,6 +91,12 @@ inline std::ostream & operator<<(std::ostream & os, const reasoning_mode & mode)
            return os << "NONE";
        case reasoning_mode::TAG_BASED:
            return os << "TAG_BASED";
+        case reasoning_mode::DELIMITER:
+            return os << "DELIMITER";
+        case reasoning_mode::FORCED_OPEN:
+            return os << "FORCED_OPEN";
+        case reasoning_mode::FORCED_CLOSED:
+            return os << "FORCED_CLOSED";
        case reasoning_mode::TOOLS_ONLY:
            return os << "TOOLS_ONLY";
        default:
@@ -181,6 +184,7 @@ struct tool_format_analysis {

    bool fun_name_is_key = false;       // In JSON format function name is JSON key, i.e. { "<funname>": { ... arguments ... } }
    bool tools_array_wrapped = false;   // Tool calls wrapped in JSON array [...]
+    bool uses_python_dicts = false;     // Tool call args use Python dict format (single-quoted strings)

    std::string              function_field = "function";
    std::string              name_field     = "name";
@@ -218,17 +222,15 @@ struct tool_id_analysis {
 // ============================================================================

 struct analyze_content;
-struct analyze_reasoning;

 struct parser_build_context {
    common_chat_peg_builder & p;
-    const generation_params &         inputs;
+    const templates_params &          inputs;
    common_peg_parser                 reasoning_parser;
    bool                              extracting_reasoning = false;
-    const analyze_reasoning *         reasoning            = nullptr;
    const analyze_content *           content              = nullptr;

-    parser_build_context(common_chat_peg_builder & p, const generation_params & inputs);
+    parser_build_context(common_chat_peg_builder & p, const templates_params & inputs);
 };

 // ============================================================================
@@ -258,7 +260,6 @@ struct analyze_reasoning : analyze_base {

    analyze_reasoning() = default;
    analyze_reasoning(const common_chat_template & tmpl, bool supports_tools);
-    analyze_reasoning(std::string start_, std::string end_) : start(std::move(start_)), end(std::move(end_)) {}

    common_peg_parser build_parser(parser_build_context & ctx) const override;

@@ -313,23 +314,19 @@ struct analyze_tools : analyze_base {

  private:
    // Extract tool calling 'haystack' for further analysis and delegate further analysis based on format
-    void analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls);
+    void analyze_tool_calls(const analyze_reasoning & reasoning);

    // Analyze format based on position of function and argument name in needle
    void analyze_tool_call_format(const std::string &       haystack,
                                  const std::string &       fun_name_needle,
                                  const std::string &       arg_name_needle,
-                                  const analyze_reasoning & reasoning,
-                                  bool                      supports_parallel_tool_calls);
+                                  const analyze_reasoning & reasoning);

    // Analyze specifics of JSON native format (entire tool call is a JSON object)
    void analyze_tool_call_format_json_native(const std::string & clean_haystack,
                                              const std::string & fun_name_needle,
                                              const std::string & arg_name_needle);

-    // Check if parallel calls in JSON native format array wrapped or tag wrapped
-    void analyze_json_native_parallel_calls();
-
    // Analyze specifics of non-JSON native format (tags for function name or for function name and arguments)
    void analyze_tool_call_format_non_json(const std::string & clean_haystack,
                                           const std::string & fun_name_needle);
@@ -362,13 +359,6 @@ struct analyze_tools : analyze_base {
    common_peg_parser build_tool_parser_json_native(parser_build_context & ctx) const;
    common_peg_parser build_tool_parser_tag_json(parser_build_context & ctx) const;
    common_peg_parser build_tool_parser_tag_tagged(parser_build_context & ctx) const;
-
-    // Shared helper: builds func_parser from open+call_id+args, handling atomic wrapping and close.
-    // atomic_peek: if present, used as the peek expression in the third atomicity branch.
-    common_peg_parser build_func_parser(common_chat_peg_builder & p, const std::string & name,
-                                        const common_peg_parser & call_id_section, bool have_call_id,
-                                        const common_peg_parser & args,
-                                        std::optional<common_peg_parser> atomic_peek) const;
 };

 // ============================================================================
@@ -377,8 +367,6 @@ struct analyze_tools : analyze_base {

 struct autoparser {
    jinja::caps          jinja_caps;
-    std::string          user_start;
-    std::string          assistant_start;
    analyze_reasoning    reasoning;
    analyze_content      content;
    analyze_tools        tools;
@@ -389,15 +377,11 @@ struct autoparser {

    autoparser() = default;

-    // Find the starting marker for the user message and assistant message
-    std::string detect_user_start_marker(const common_chat_template & tmpl);
-    std::string detect_assistant_start_marker(const common_chat_template & tmpl);
-
    // Run full differential analysis on a template
    void analyze_template(const common_chat_template & tmpl);

    // Build the PEG parser for this template
-    common_peg_arena build_parser(const generation_params & inputs, const std::string & generation_prompt) const;
+    common_peg_arena build_parser(const templates_params & inputs) const;

  private:
    // Collect tokens from entire analysis to preserve
@@ -411,10 +395,10 @@ struct autoparser {
 class peg_generator {
  public:
    static common_chat_params generate_parser(const common_chat_template &    tmpl,
-                                              const struct generation_params & inputs);
+                                              const struct templates_params & inputs);

    static common_chat_params generate_parser(const common_chat_template &    tmpl,
-                                              const struct generation_params & inputs,
+                                              const struct templates_params & inputs,
                                              const autoparser &              autoparser);
 };

--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@@ -2,15 +2,11 @@
 #include "chat-auto-parser-helpers.h"
 #include "chat-peg-parser.h"
 #include "chat.h"
-#include "common.h"
 #include "log.h"
 #include "nlohmann/json.hpp"
 #include "peg-parser.h"

 #include <algorithm>
-#include <cctype>
-#include <ostream>
-#include <sstream>

 #define ANSI_RESET  "\033[0m"
 #define ANSI_PURPLE "\033[1m\x1b[38;5;126m"
@@ -26,12 +22,8 @@ static const std::string FUN_SECOND = "SSS_SECOND_FUN_S";
 static const std::string ARG_FIRST = "AA_ARG_FST_AA";
 static const std::string ARG_SECOND = "BB_ARG_SND_BB";
 static const std::string USER_MSG = "U_USER_MSG Hello END_U";
-static const std::string USER_MSG_TWO = "V_USER_MSG Hello END_V";
 static const std::string ASSISTANT_MSG = "A_ASST_MSG I can help END_A";
 static const std::string THINKING_CONTENT = "REASON_PART I am thinking END_R";
-static const std::string CALL_ID_001 = "call00001";
-static const std::string CALL_ID_002 = "call00002";
-static const std::string CALL_ID_999 = "call99999";

 static std::vector<std::function<void(const common_chat_template & tmpl, autoparser &)>> workarounds(
    { // Old reasoning Qwen templates - they don't really display reasoning content, but we still want to
@@ -39,9 +31,8 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
          if (tmpl.src.find("content.split('</think>')") != std::string::npos &&
              tmpl.src.find("reasoning_content") == std::string::npos &&
-              tmpl.src.find("<SPECIAL_12>") == std::string::npos &&
              analysis.reasoning.mode == reasoning_mode::NONE) {
-              analysis.reasoning.mode  = reasoning_mode::TAG_BASED;
+              analysis.reasoning.mode  = reasoning_mode::FORCED_OPEN;
              analysis.reasoning.start = "<think>";
              analysis.reasoning.end   = "</think>";
              analysis.preserved_tokens.push_back("<think>");
@@ -75,7 +66,6 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
              analysis.content.end   = "<|END_OF_TURN_TOKEN|>";
              analysis.preserved_tokens.push_back("<|CHATBOT_TOKEN|>");
              analysis.preserved_tokens.push_back("<|END_OF_TURN_TOKEN|>");
-              analysis.user_start = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>";
              LOG_DBG(ANSI_ORANGE "[Patch: Cohere Command R+]\n" ANSI_RESET);
          }
      },
@@ -111,61 +101,8 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
              analysis.tools.function.name_prefix  = "<｜tool▁sep｜>";
              analysis.tools.format.per_call_end   = "<｜tool▁call▁end｜>";
              analysis.tools.function.close        = "```";
-              LOG_DBG(ANSI_ORANGE "[Patch: DeepSeek-R1-Distill-Qwen]\n" ANSI_RESET);
          }
-      },
-      // Nemotron Nano v2
-      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
-          if (tmpl.src.find("<SPECIAL_10>") != std::string::npos && tmpl.src.find("<SPECIAL_11>") != std::string::npos &&
-              tmpl.src.find("<SPECIAL_12>") != std::string::npos && tmpl.src.find("<TOOL_RESPONSE>") != std::string::npos) {
-
-              analysis.tools.format.mode           = tool_format::JSON_NATIVE;
-              analysis.tools.format.section_start  = "";
-              analysis.tools.format.section_end    = "";
-              analysis.tools.format.per_call_start = "<TOOLCALL>";
-              analysis.tools.format.per_call_end   = "</TOOLCALL>";
-              analysis.content.mode                = content_mode::PLAIN;
-              analysis.content.start               = "";
-              analysis.content.end                 = "";
-              analysis.reasoning.mode              = reasoning_mode::TAG_BASED;
-              analysis.reasoning.start             = "<think>\n\n";
-              analysis.reasoning.end               = "</think>";
-              analysis.assistant_start             = "<SPECIAL_11>Assistant";
-              analysis.user_start                  = "<SPECIAL_11>User";
-              analysis.preserved_tokens.clear();
-              analysis.preserved_tokens.push_back("<SPECIAL_12>");
-              analysis.preserved_tokens.push_back("<SPECIAL_11>");
-              analysis.preserved_tokens.push_back("</think>");
-              analysis.preserved_tokens.push_back("<TOOLCALL>");
-              analysis.preserved_tokens.push_back("</TOOLCALL>");
-              LOG_DBG(ANSI_ORANGE "[Patch: Nemotron Nano v2]\n" ANSI_RESET);
-          }
-      },
-      // Fireworks
-      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
-          if (tmpl.src.find("{%- set system_prompt = '<|start_header_id|>' + 'system' + '<|end_header_id|>\\n\\n'"
-            " + message['content'] | trim + '\\n' + system_prompt_suffix + '<|eot_id|>' -%}") != std::string::npos) {
-              analysis.assistant_start             = "<|start_header_id|>assistant<|end_header_id|>";
-              analysis.user_start                  = "<|start_header_id|>user<|end_header_id|>";
-              LOG_DBG(ANSI_ORANGE "[Patch: Fireworks v2]\n" ANSI_RESET);
-          }
-      },
-      // Solar Open
-      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
-          if (tmpl.src.find("<|begin|>assistant<|think|><|end|>") != std::string::npos) {
-              analysis.assistant_start             = "<|begin|>assistant";
-              LOG_DBG(ANSI_ORANGE "[Patch: Solar Open]\n" ANSI_RESET);
-          }
-      },
-      // Apriel 1.6
-      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
-          if (tmpl.src.find("if not loop.last and '[BEGIN FINAL RESPONSE]' in asst_text") != std::string::npos) {
-              analysis.user_start                  = "<|begin_user|>";
-              analysis.assistant_start             = "<|begin_assistant|>";
-              LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET);
-          }
-      },
-
+      }
    });

 // Common JSON structures
@@ -191,7 +128,7 @@ static json user_msg = json{
    { "content", USER_MSG }
 };

-static json build_tool_call(const std::string & name, const json & args, const std::string & id = CALL_ID_001) {
+static json build_tool_call(const std::string & name, const json & args, const std::string & id = "call00001") {
    return json{
        { "id",       id                                              },
        { "type",     "function"                                      },
@@ -199,17 +136,17 @@ static json build_tool_call(const std::string & name, const json & args, const s
    };
 }

-static json first_tool_call_zero_args         = build_tool_call(FUN_FIRST, json::object(), CALL_ID_001);
-static json first_tool_call_one_arg           = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "XXXX" }}, CALL_ID_001);
-static json first_tool_call_one_arg_other_val = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "YYYY" }}, CALL_ID_001);
-static json first_tool_call_other_arg         = build_tool_call(FUN_FIRST, {{ ARG_SECOND, "YYYY" }}, CALL_ID_001);
+static json first_tool_call_zero_args         = build_tool_call(FUN_FIRST, json::object(), "call00001");
+static json first_tool_call_one_arg           = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "XXXX" }}, "call00001");
+static json first_tool_call_one_arg_other_val = build_tool_call(FUN_FIRST, {{ ARG_FIRST, "YYYY" }}, "call00001");
+static json first_tool_call_other_arg         = build_tool_call(FUN_FIRST, {{ ARG_SECOND, "YYYY" }}, "call00001");

 static json first_tool_call =
-    build_tool_call(FUN_FIRST, json{{ ARG_FIRST,  "XXXX" }, { ARG_SECOND, "YYYY" }}, CALL_ID_001);
+    build_tool_call(FUN_FIRST, json{{ ARG_FIRST,  "XXXX" }, { ARG_SECOND, "YYYY" }}, "call00001");
 static json second_tool_call =
-    build_tool_call(FUN_SECOND, json{ { ARG_FIRST,  "XXXX" }, { ARG_SECOND, "YYYY" }}, CALL_ID_002);
+    build_tool_call(FUN_SECOND, json{ { ARG_FIRST,  "XXXX" }, { ARG_SECOND, "YYYY" }}, "call00002");
 static json first_tool_call_alt_id =
-    build_tool_call(FUN_FIRST, json{{ ARG_FIRST,  "XXXX" }, { ARG_SECOND, "YYYY" }}, CALL_ID_999);
+    build_tool_call(FUN_FIRST, json{{ ARG_FIRST,  "XXXX" }, { ARG_SECOND, "YYYY" }}, "call99999");

 template <typename T>
 static std::string mode_to_str(T mode) {
@@ -223,8 +160,6 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
    reasoning = analyze_reasoning(tmpl, jinja_caps.supports_tool_calls);
    content = analyze_content(tmpl, reasoning);
    tools = analyze_tools(jinja_caps.supports_tool_calls ? analyze_tools(tmpl, jinja_caps, reasoning) : analyze_tools());
-    assistant_start = detect_assistant_start_marker(tmpl);
-    user_start = detect_user_start_marker(tmpl);
    collect_preserved_tokens();

    for (auto & workaround : workarounds) {
@@ -232,8 +167,6 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
    }

    LOG_DBG("\n--- Reasoning & Content Structure ---\n");
-    LOG_DBG("user_msg_start: %s\n", user_start.c_str());
-    LOG_DBG("assistant_msg_start: %s\n", assistant_start.c_str());
    LOG_DBG("reasoning_mode: %s\n", mode_to_str(reasoning.mode).c_str());
    LOG_DBG("reasoning_start: '%s'\n", reasoning.start.c_str());
    LOG_DBG("reasoning_end: '%s'\n", reasoning.end.c_str());
@@ -252,11 +185,7 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
    LOG_DBG("func_name_prefix: '%s'\n", tools.function.name_prefix.c_str());
    LOG_DBG("func_name_suffix: '%s'\n", tools.function.name_suffix.c_str());
    LOG_DBG("func_close: '%s'\n", tools.function.close.c_str());
-    LOG_DBG("call_id_prefix: '%s'\n", tools.call_id.prefix.c_str());
-    LOG_DBG("call_id_suffix: '%s'\n", tools.call_id.suffix.c_str());
-    LOG_DBG("call_id_pos: '%s'\n", mode_to_str(tools.call_id.pos).c_str());
-    LOG_DBG("args_start: '%s'\n", tools.arguments.start.c_str());
-    LOG_DBG("args_end: '%s'\n", tools.arguments.end.c_str());
+    LOG_DBG("python_dict_format: %s\n", tools.format.uses_python_dicts ? "true" : "false");
    LOG_DBG("arg_name_prefix: '%s'\n", tools.arguments.name_prefix.c_str());
    LOG_DBG("arg_name_suffix: '%s'\n", tools.arguments.name_suffix.c_str());
    LOG_DBG("arg_value_prefix: '%s'\n", tools.arguments.value_prefix.c_str());
@@ -306,120 +235,6 @@ void autoparser::collect_preserved_tokens() {
    add_token(tools.call_id.suffix);
 }

-std::string autoparser::detect_assistant_start_marker(const common_chat_template & tmpl) {
-    json user_msg = json{
-        { "role",    "user"   },
-        { "content", USER_MSG }
-    };
-
-    json assistant_no_reasoning = json{
-        { "role",    "assistant"   },
-        { "content", ASSISTANT_MSG }
-    };
-
-    template_params params;
-    params.messages              = json::array({ user_msg });
-    params.add_generation_prompt = false;
-    params.enable_thinking       = true;
-
-    auto comparison = compare_variants(
-        tmpl, params, [&](template_params & p) {
-            p.messages = json::array({ user_msg, assistant_no_reasoning });
-        }
-    );
-
-    if (!comparison) {
-        LOG_DBG(ANSI_ORANGE "%s: Template application failed, skipping assistant start detection\n" ANSI_RESET, __func__);
-        return "";
-    }
-
-    auto usermsg = comparison->diff.right;
-    if (usermsg.find(ASSISTANT_MSG) == std::string::npos) {
-        LOG_DBG(ANSI_ORANGE "%s: Did not find assistant message in assistant message block, skipping detection\n" ANSI_RESET, __func__);
-    }
-
-    auto ast_prefix = usermsg.substr(0, usermsg.find(ASSISTANT_MSG));
-    if (!reasoning.start.empty() && ast_prefix.find(trim_whitespace(reasoning.start)) != std::string::npos) {
-        ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.start)));
-    }
-    if (!reasoning.end.empty() && ast_prefix.find(trim_whitespace(reasoning.end)) != std::string::npos) {
-        ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.end)));
-    }
-    return trim_whitespace(ast_prefix);
-}
-
-std::string autoparser::detect_user_start_marker(const common_chat_template & tmpl) {
-    json user_msg = json{
-        { "role",    "user"   },
-        { "content", USER_MSG }
-    };
-
-    json assistant = json{
-        { "role",    "assistant"   },
-        { "content", ASSISTANT_MSG }
-    };
-
-    json user_msg_two = json{
-        { "role",    "user"       },
-        { "content", USER_MSG_TWO }
-    };
-
-    template_params params;
-    params.messages              = json::array({});
-    params.add_generation_prompt = false;
-    params.enable_thinking       = true;
-
-    auto comparison = compare_variants(
-        tmpl, params, [&](template_params & p) {
-            p.messages = json::array({ user_msg });
-        }
-    );
-
-    if (!comparison) {
-        LOG_DBG(ANSI_ORANGE "%s: Template application failed, unsupported empty messages? trying complex variant\n" ANSI_RESET, __func__);
-        params.messages = json::array({ user_msg_two, assistant });
-        comparison = compare_variants(
-            tmpl, params, [&](template_params & p) {
-                p.messages = json::array({ user_msg_two, assistant, user_msg });
-            }
-        );
-        if (!comparison) {
-            LOG_DBG(ANSI_ORANGE "%s: Template application failed for reserve variant, aborting\n" ANSI_RESET, __func__);
-            return "";
-        }
-    }
-
-    auto usermsg = comparison->diff.right;
-    if (usermsg.find(USER_MSG) == std::string::npos) {
-        LOG_DBG(ANSI_ORANGE "%s: Did not find user message in user message block, aborting detection\n" ANSI_RESET, __func__);
-    }
-
-    if (usermsg.find(ASSISTANT_MSG) != std::string::npos) {
-        usermsg = usermsg.substr(usermsg.find(ASSISTANT_MSG) + ASSISTANT_MSG.size());
-    }
-
-    auto candidate = usermsg.substr(0, usermsg.find(USER_MSG));
-    auto candidate_split = segmentize_markers(candidate);
-    std::stringstream result;
-    bool encountered_marker = false;
-    for (const auto & mrk : candidate_split) {
-        std::string lower_mrk = std::string(mrk.value);
-        std::transform(lower_mrk.begin(), lower_mrk.end(), lower_mrk.begin(),
-            [](unsigned char c) { return std::tolower(c); });
-        // heuristic to weed out potential end markers, but only at the start
-        if (mrk.type == segment_type::MARKER && !encountered_marker &&
-            (lower_mrk.find("end") != std::string::npos || lower_mrk.find("close") != std::string::npos)) {
-            continue;
-        }
-        if (mrk.type == segment_type::TEXT && !encountered_marker && trim_whitespace(mrk.value).empty()) {
-            continue;
-        }
-        encountered_marker |= mrk.type == segment_type::MARKER;
-        result << mrk.value;
-    }
-    return trim_whitespace(result.str());
-}
-
 analyze_reasoning::analyze_reasoning(const common_chat_template & tmpl, bool supports_tools)
    : analyze_base(tmpl) {
    LOG_DBG(ANSI_PURPLE "=== Starting differential analysis ===\n" ANSI_RESET);
@@ -471,7 +286,7 @@ void analyze_reasoning::compare_reasoning_presence() {
            return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
        });
        auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
-            return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.tag("post", (p.space() + p.marker() + p.space())) + p.rest();
+            return p.tag("pre", p.marker()) + p.space() + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
        });
        // try the more aggressive parse first, if it fails, fall back to the delimiter one
        auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
@@ -480,11 +295,15 @@ void analyze_reasoning::compare_reasoning_presence() {
        }
        if (result.result.success()) {
            if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
-                mode = reasoning_mode::TAG_BASED;
-                start = result.tags["pre"];
+                if (parser_wrapped.parse_anywhere_and_extract(diff.right).result.success()) { // both tags in the diff = no forced close
+                    mode = reasoning_mode::TAG_BASED;
+                } else {
+                    mode = reasoning_mode::FORCED_CLOSED;
+                }
+                start = trim_whitespace(result.tags["pre"]);
                end   = result.tags["post"];
            } else if (!result.tags["post"].empty()) {
-                mode = reasoning_mode::TAG_BASED;
+                mode = reasoning_mode::DELIMITER;
                end = result.tags["post"];
            }
        }
@@ -512,66 +331,53 @@ void analyze_reasoning::compare_thinking_enabled() {
    const auto & diff = comparison->diff;

    std::string left_trimmed = trim_whitespace(diff.left);
-    std::string right_trimmed = trim_whitespace(diff.right);

    if (left_trimmed.empty() && !diff.right.empty()) {
+        std::string right_trimmed = trim_whitespace(diff.right);
+
        if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
            if (start.empty()) {
-                start = diff.right;
-                mode  = reasoning_mode::TAG_BASED;
-            }
-        }
-    } else if (right_trimmed.empty() && !diff.left.empty()) {
-        if (!left_trimmed.empty() && string_ends_with(comparison->output_A, left_trimmed)) {
-            if (end.empty()) {
-                auto seg = prune_whitespace_segments(segmentize_markers(comparison->output_A));
-                if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
-                    start = seg[seg.size() - 2].value;
-                }
-                end = diff.left;
-                mode = reasoning_mode::TAG_BASED;
-            }
-        }
-    } else if (!left_trimmed.empty() && !right_trimmed.empty()) {
-        // Full-output diff is noisy (e.g., SmolLM3 changes the system message when enable_thinking flips).
-        // Try to find reasoning markers by tail-anchoring:
-        // one output's generation prompt tail may appear in the other with extra reasoning markers appended.
-        const auto & output_A = comparison->output_A;
-        const auto & output_B = comparison->output_B;
-        const size_t anchor_len = 64;
-
-        for (int dir = 0; dir < 2; dir++) {
-            const auto & base     = dir == 0 ? output_B : output_A;
-            const auto & extended = dir == 0 ? output_A : output_B;
-
-            size_t len = std::min(base.size(), anchor_len);
-            std::string anchor = base.substr(base.size() - len);
-            auto pos = extended.rfind(anchor);
-            if (pos == std::string::npos || pos + len >= extended.size()) {
-                continue;
-            }
-
-            std::string extra = trim_whitespace(extended.substr(pos + len));
-            if (extra.empty()) {
-                continue;
-            }
-
-            auto seg = prune_whitespace_segments(segmentize_markers(extra));
-            if (seg.size() == 2 && seg[0].type == segment_type::MARKER && seg[1].type == segment_type::MARKER) {
-                if (start.empty()) {
-                    start = seg[0].value;
-                }
-                if (end.empty()) {
-                    end   = seg[1].value;
-                }
-                mode = reasoning_mode::TAG_BASED;
-                break;
+                start = right_trimmed;
+                mode  = reasoning_mode::FORCED_OPEN;
            }
        }
    }

-    if (mode == reasoning_mode::NONE && start.empty() && !end.empty()) {
-        mode = reasoning_mode::TAG_BASED;
+    if (start.empty() && !end.empty()) {
+        mode = reasoning_mode::DELIMITER;
+    }
+
+    // Check for FORCED_CLOSED: when enable_thinking=false produces both start and end markers,
+    // but enable_thinking=true produces only the start marker
+    if (!comparison->output_A.empty() && !comparison->output_B.empty()) {
+        auto parser_start = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
+            return p.literal(start) + p.space() + p.literal(end) + p.rest();
+        });
+        auto parser_start_end = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
+            return p.tag("pre", p.literal(start)) + p.space() + p.negate(p.literal(end)) + p.rest();
+        });
+        if (!start.empty() && parser_start_end.parse_anywhere_and_extract(comparison->output_A).result.success() &&
+            parser_start.parse_anywhere_and_extract(comparison->output_B).result.success()) {
+            mode = reasoning_mode::FORCED_CLOSED;
+        } else if (!end.empty()) { // we extract the starting marker now since we didn't get it earlier
+            auto result = parser_start_end.parse_anywhere_and_extract(comparison->output_A);
+            if (result.result.success()) {
+                start = result.tags["pre"];
+                mode  = reasoning_mode::FORCED_CLOSED;
+            }
+        }
+    }
+
+    if (start.empty() && end.empty()) {  // we might still have the case of "just open" and "just close"
+        if (!diff.left.empty() && !diff.right.empty()) {
+            auto seg_A = segmentize_markers(trim_trailing_whitespace(diff.left));
+            auto seg_B = segmentize_markers(trim_trailing_whitespace(diff.right));
+            if (seg_A.size() == 1 && seg_B.size() == 1) {
+                mode = reasoning_mode::FORCED_CLOSED;
+                start = seg_B[0].value;
+                end = seg_A[0].value;
+            }
+        }
    }
 }

@@ -615,7 +421,7 @@ void analyze_reasoning::compare_reasoning_scope() {
        LOG_DBG(ANSI_ORANGE "%s: Detected TOOLS_ONLY reasoning mode\n" ANSI_RESET, __func__);

        auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
-            return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space()));
+            return p.tag("pre", p.marker()) + p.space() + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space()));
        });
        auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
        if (result.result.success()) {
@@ -629,7 +435,7 @@ void analyze_reasoning::compare_reasoning_scope() {
            if (result.result.success()) {
                end = result.tags["post"];
            } else {
-                LOG_DBG(ANSI_ORANGE "%s: Unable to extract reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__);
+                LOG_DBG(ANSI_ORANGE "%s: Unable to extract reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__);
                mode = reasoning_mode::NONE;
            }
        }
@@ -708,7 +514,7 @@ analyze_content::analyze_content(const common_chat_template & tmpl, const analyz
        // Take the more promising diff
        std::string pure_content = rdiff.length() > diff_tools.left.length() ? rdiff : diff_tools.left;
        auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
-            return p.tag("pre", p.marker() + p.space()) + p.literal(response) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
+            return p.tag("pre", p.marker()) + p.space() + p.literal(response) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
        });
        auto result = parser_wrapped.parse_anywhere_and_extract(pure_content);
        start = result.tags["pre"];
@@ -733,26 +539,23 @@ analyze_tools::analyze_tools(const common_chat_template & tmpl,
    : analyze_base(tmpl) {
    LOG_DBG(ANSI_ORANGE "Phase 3: Tool call analysis\n" ANSI_RESET);

-    analyze_tool_calls(reasoning, caps.supports_parallel_tool_calls);
+    analyze_tool_calls(reasoning);

    if (format.mode != tool_format::NONE && format.mode != tool_format::JSON_NATIVE) {
        if (caps.supports_parallel_tool_calls) {
            check_per_call_markers();
        }
-        LOG_DBG(ANSI_ORANGE "Phase 3a: Function call analysis\n" ANSI_RESET);
        extract_function_markers();
-        LOG_DBG(ANSI_ORANGE "Phase 3b: Argument analysis\n" ANSI_RESET);
        if (format.mode == tool_format::TAG_WITH_TAGGED) {
            analyze_arguments();
        }
        extract_argument_separator();
        extract_args_markers();
-        LOG_DBG(ANSI_ORANGE "Phase 3c: Call id analysis\n" ANSI_RESET);
        extract_call_id_markers();
    }
 }

-void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning, bool supports_parallel_tool_calls) {
+void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning) {
    json assistant_no_tools = json{
        { "role",    "assistant"   },
        { "content", ASSISTANT_MSG }
@@ -786,35 +589,44 @@ void analyze_tools::analyze_tool_calls(const analyze_reasoning & reasoning, bool
        return;
    }

-    analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning, supports_parallel_tool_calls);
+    analyze_tool_call_format(tool_section, FUN_FIRST, ARG_FIRST, reasoning);
 }

 void analyze_tools::analyze_tool_call_format(const std::string &       haystack,
                                             const std::string &       fun_name_needle,
                                             const std::string &       arg_name_needle,
-                                             const analyze_reasoning & reasoning,
-                                             bool                      supports_parallel_tool_calls) {
+                                             const analyze_reasoning & reasoning) {
    if (fun_name_needle.empty() || arg_name_needle.empty() || haystack.empty()) {
        return;
    }

-    auto in_json_haystack = [&haystack](const std::string & needle) -> bool {
+    enum class json_quote_style { NONE, DOUBLE_QUOTES, SINGLE_QUOTES };
+
+    auto in_json_haystack = [&haystack](const std::string & needle) -> json_quote_style {
        auto parser = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
            return p.choice({ p.literal("{"), p.literal(":") }) << p.choice({
+                p.tag("sq", p.literal("'") + p.literal(needle) + p.literal("'")),
                p.tag("dq", p.literal("\"") + p.literal(needle) + p.literal("\"")) });
        });
        auto result = parser.parse_anywhere_and_extract(haystack);
-        return result.result.success();
+        if (!result.result.success()) {
+            return json_quote_style::NONE;
+        }
+        return result.tags.count("sq") && !result.tags["sq"].empty()
+            ? json_quote_style::SINGLE_QUOTES
+            : json_quote_style::DOUBLE_QUOTES;
    };

    auto fun_quote = in_json_haystack(fun_name_needle);
    auto arg_quote = in_json_haystack(arg_name_needle);

-    if (fun_quote) {
+    if (fun_quote != json_quote_style::NONE) {
        // no need to check further, we're in JSON land
        format.mode = tool_format::JSON_NATIVE;
-    } else if (arg_quote) {
+        format.uses_python_dicts = (fun_quote == json_quote_style::SINGLE_QUOTES);
+    } else if (arg_quote != json_quote_style::NONE) {
        format.mode = tool_format::TAG_WITH_JSON;
+        format.uses_python_dicts = (arg_quote == json_quote_style::SINGLE_QUOTES);
    } else {
        format.mode = tool_format::TAG_WITH_TAGGED;
    }
@@ -836,9 +648,6 @@ void analyze_tools::analyze_tool_call_format(const std::string &       haystack,

    if (format.mode == tool_format::JSON_NATIVE) {
        analyze_tool_call_format_json_native(clean_haystack, fun_name_needle, arg_name_needle);
-        if (supports_parallel_tool_calls) {
-            analyze_json_native_parallel_calls();
-        }
    } else {
        analyze_tool_call_format_non_json(clean_haystack, fun_name_needle);
    }
@@ -847,42 +656,6 @@ void analyze_tools::analyze_tool_call_format(const std::string &       haystack,
    format.per_call_end = trim_whitespace(format.per_call_end);
 }

-void analyze_tools::analyze_json_native_parallel_calls() {
-    json assistant_one_tool = json{
-        { "role",       "assistant" },
-        { "content",    ""          },
-        { "tool_calls", json::array({ first_tool_call }) }
-    };
-
-    json assistant_two_tools = json{
-        { "role",       "assistant" },
-        { "content",    ""          },
-        { "tool_calls", json::array({ first_tool_call, second_tool_call }) }
-    };
-
-    template_params params;
-    params.messages              = json::array({ user_msg, assistant_one_tool });
-    params.tools                 = tools;
-    params.add_generation_prompt = false;
-    params.enable_thinking       = true;
-
-    auto comparison = compare_variants(
-        *tmpl, params, [&](template_params & p) { p.messages = json::array({ user_msg, assistant_two_tools }); });
-
-    if (!comparison) {
-        LOG_DBG(ANSI_ORANGE "%s: Template application failed\n" ANSI_RESET, __func__);
-        return;
-    }
-
-    std::string & second_call = comparison->diff.right;
-    if (!format.section_start.empty() && second_call.find(format.section_start) != std::string::npos) {
-        format.per_call_start = format.section_start;
-        format.per_call_end = format.section_end;
-        format.section_start.clear();
-        format.section_end.clear();
-    }
-}
-
 void analyze_tools::analyze_tool_call_format_json_native(const std::string & clean_haystack,
                                                         const std::string & fun_name_needle,
                                                         const std::string & arg_name_needle) {
@@ -1178,6 +951,8 @@ void analyze_tools::extract_function_markers() {
 }

 void analyze_tools::analyze_arguments() {
+    LOG_DBG(ANSI_ORANGE "Phase 4: Argument analysis\n" ANSI_RESET);
+
    extract_argument_name_markers();
    extract_argument_value_markers();
 }
@@ -1386,7 +1161,7 @@ void analyze_tools::extract_args_markers() {

    const auto & diff = comparison->diff;

-    if (format.mode == tool_format::JSON_NATIVE) {
+    if (format.mode != tool_format::JSON_NATIVE) {
        std::string prefix_marker = !format.section_start.empty() ? format.section_start : format.per_call_start;
        std::string suffix_marker = !format.section_end.empty() ? format.section_end : format.per_call_end;
        // these might happen earlier in the tools section as an example or somewhere else, so we need to find the closest ones
@@ -1408,10 +1183,6 @@ void analyze_tools::extract_args_markers() {
            if (find_fun != std::string::npos) {
                args_start = args_start.substr(find_fun + FUN_FIRST.size(), args_start.size() - find_fun - FUN_FIRST.size());
            }
-            size_t find_call_id = args_start.find(CALL_ID_001);
-            if (find_call_id != std::string::npos) {
-                args_start = args_start.substr(find_call_id + CALL_ID_001.size(), args_start.size() - find_call_id - CALL_ID_001.size());
-            }
            arguments.start = args_start;
            arguments.end   = args_end;
        }
@@ -1451,8 +1222,8 @@ void analyze_tools::extract_call_id_markers() {
        return;
    }

-    std::string id_value_1 = CALL_ID_001;
-    std::string id_value_2 = CALL_ID_999;
+    std::string id_value_1 = "call00001";
+    std::string id_value_2 = "call99999";

    size_t common_id_prefix_len = 0;
    for (size_t i = 0; i < std::min(id_value_1.length(), id_value_2.length()); i++) {
@@ -1551,14 +1322,6 @@ void analyze_tools::extract_call_id_markers() {
        call_id.suffix = find_first_marker(before_func);
    }

-    if (call_id.prefix == arguments.end) {
-        call_id.prefix = "";
-    }
-
-    if (call_id.suffix == arguments.start) {
-        call_id.suffix = "";
-    }
-
    // When call_id is detected, per_call_end may have been incorrectly set to include
    // the call_id_suffix and sample args. Clear it if it starts with call_id_suffix.
    if (call_id.pos != call_id_position::NONE && !call_id.suffix.empty() &&
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@@ -214,10 +214,6 @@ std::string & common_chat_peg_mapper::args_target() {
    return (current_tool && !current_tool->name.empty()) ? current_tool->arguments : args_buffer;
 }

-std::string common_chat_peg_mapper::normalize_container_value(const std::string & input) {
-    return normalize_quotes_to_json(input);
-}
-
 void common_chat_peg_mapper::from_ast(const common_peg_ast_arena &    arena,
                                      const common_peg_parse_result & parse_result_arg) {
    arena.visit(parse_result_arg, [this](const common_peg_ast_node & node) { map(node); });
@@ -233,20 +229,6 @@ void common_chat_peg_mapper::from_ast(const common_peg_ast_arena &    arena,
        result.tool_calls.push_back(pending_tool_call.value());
        pending_tool_call.reset();
    }
-
-    // Discard whitespace-only reasoning content (e.g. from <think></think> prefill)
-    if (!result.reasoning_content.empty()) {
-        bool all_whitespace = true;
-        for (char c : result.reasoning_content) {
-            if (c != ' ' && c != '\n' && c != '\r' && c != '\t') {
-                all_whitespace = false;
-                break;
-            }
-        }
-        if (all_whitespace) {
-            result.reasoning_content.clear();
-        }
-    }
 }

 void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
@@ -356,9 +338,37 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
            // For potential containers, normalize Python-style single quotes to JSON double quotes
            bool is_potential_container = value_content[0] == '[' || value_content[0] == '{';
            if (is_potential_container) {
-                value_content = normalize_container_value(value_content);
+                value_content = normalize_quotes_to_json(value_content);
+            }
+
+            // Try to parse as JSON value (number, bool, null, object, array)
+            try {
+                ordered_json parsed = ordered_json::parse(value_content);
+                if (parsed.is_string()) {
+                    // Don't add closing quote yet (added by arg_close) for monotonic streaming
+                    std::string escaped = parsed.dump();
+                    if (!escaped.empty() && escaped.back() == '"') {
+                        escaped.pop_back();
+                    }
+                    value_to_add          = escaped;
+                    closing_quote_pending = true;
+                } else {
+                    // Non-string values: use raw content to preserve whitespace for monotonicity
+                    value_to_add = value_content;
+                }
+            } catch (...) {
+                if (node.is_partial && is_potential_container) {
+                    // Partial container: pass through the already-normalized content
+                    value_to_add = value_content;
+                } else {
+                    // Not valid JSON - treat as string value
+                    if (!closing_quote_pending) {
+                        value_to_add          = "\"";
+                        closing_quote_pending = true;
+                    }
+                    value_to_add += escape_json_string_inner(value_content);
+                }
            }
-            value_to_add += value_content;
        }

        args_target() += value_to_add;
@@ -648,7 +658,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_nested_keys(
        ordered_json   params   = function.contains("parameters") ? function.at("parameters") : ordered_json::object();

        auto nested_name = literal("\"" + nested_name_field + "\"") + space() + literal(":") + space() +
-                          atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
+                          literal("\"") + tool_name(literal(name)) + literal("\"");
        auto nested_args = literal("\"" + nested_args_field + "\"") + space() + literal(":") + space() +
                          tool_args(schema(json(), "tool-" + name + "-schema", params));

@@ -716,7 +726,7 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
        ordered_json   params   = function.contains("parameters") ? function.at("parameters") : ordered_json::object();

        auto tool_name_ = name_key_parser + space() + literal(":") + space() +
-                         atomic(literal("\"") + tool_name(literal(name)) + literal("\""));
+                         literal("\"") + tool_name(literal(name)) + literal("\"");
        auto tool_args_ = args_key_parser + space() + literal(":") + space() +
                         tool_args(schema(json(), "tool-" + name + "-schema", params));

@@ -778,42 +788,6 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys(
    return tool_choices;
 }

-common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const std::string & delimiter) {
-    if (s.empty()) {
-        return eps();
-    }
-    if (delimiter.empty()) {
-        return literal(s);
-    }
-    return literal(s.substr(0, s.find(delimiter)));
-}
-
-common_peg_parser common_chat_peg_builder::optspace(const std::string & tag) {
-    auto parser = eps();
-    size_t end_of_prefix_space = tag.size();
-    size_t start_of_suffix_space = tag.size();
-    for (size_t i = 0; i < tag.size(); i++) {
-        if (!std::isspace(tag[i])) {
-            end_of_prefix_space = i;
-            break;
-        }
-    }
-    for (size_t i = tag.size(); i > 0; i--) {
-        if (!std::isspace(tag[i - 1])) {
-            start_of_suffix_space = i;
-            break;
-        }
-    }
-    for (size_t i = 0; i < end_of_prefix_space; i++) {
-        parser += optional(literal(std::string(1, tag[i])));
-    }
-    parser += literal(tag.substr(end_of_prefix_space, start_of_suffix_space - end_of_prefix_space));
-    for (size_t i = start_of_suffix_space; i < tag.size(); i++) {
-        parser += optional(literal(std::string(1, tag[i])));
-    }
-    return parser;
-}
-
 common_peg_parser common_chat_peg_builder::standard_json_tools(
                                                       const std::string &              section_start,
                                                       const std::string &              section_end,
@@ -863,143 +837,3 @@ common_peg_parser common_chat_peg_builder::standard_json_tools(

    return force_tool_calls ? section : optional(section);
 }
-
-void common_chat_peg_gemma4_mapper::from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result) {
-    for (const auto & node : result.nodes) {
-        visit(arena, node);
-    }
-}
-
-static std::string gemma4_to_json(const common_peg_ast_arena & arena, common_peg_ast_id id) {
-    const auto & node = arena.get(id);
-
-    if (node.text.empty()) {
-        return "";
-    }
-
-    if (node.rule == "gemma4-number" || node.rule == "gemma4-bool" || node.rule == "gemma4-null") {
-        return std::string(node.text);
-    }
-
-    if (node.rule == "gemma4-string-content") {
-        return escape_json_string_inner(std::string(node.text));
-    }
-
-    if (node.rule == "gemma4-string") {
-        std::string result = "\"";
-        if (!node.children.empty()) {
-            result += gemma4_to_json(arena, node.children[0]);
-            if (!node.is_partial) {
-                result += "\"";
-            }
-        }
-        return result;
-    }
-
-    if (node.rule == "gemma4-array") {
-        std::string result = "[";
-
-        bool add_comma = false;
-        for (auto child_id : node.children) {
-            if (add_comma) {
-                result += ',';
-            }
-            add_comma = true;
-            result += gemma4_to_json(arena, child_id);
-        }
-
-        if (!node.is_partial) {
-            result += ']';
-        }
-        return result;
-    }
-
-    if (node.rule == "gemma4-dict-key-name") {
-        return std::string(node.text);
-    }
-
-    if (node.rule == "gemma4-dict-key") {
-        std::string result = "\"";
-        if (!node.children.empty()) {
-            result += escape_json_string_inner(gemma4_to_json(arena, node.children[0]));
-        }
-        if (!node.is_partial) {
-            result += "\":";
-        }
-        return result;
-    }
-
-    if (node.rule == "gemma4-dict-kv") {
-        std::string result;
-        for (auto child_id : node.children) {
-            result += gemma4_to_json(arena, child_id);
-        }
-        return result;
-    }
-
-    if (node.rule == "gemma4-dict") {
-        std::string result = "{";
-
-        bool add_comma = false;
-        for (auto child_id : node.children) {
-            if (add_comma) {
-                result += ',';
-            }
-            add_comma = true;
-            result += gemma4_to_json(arena, child_id);
-        }
-
-        if (!node.is_partial) {
-            result += '}';
-        }
-        return result;
-    }
-
-    if (node.rule == "gemma4-value") {
-        if (!node.children.empty()) {
-            return gemma4_to_json(arena, node.children[0]);
-        }
-        return "";
-    }
-
-    return "";
-}
-
-void common_chat_peg_gemma4_mapper::visit(const common_peg_ast_arena & arena, common_peg_ast_id id) {
-    const auto & node = arena.get(id);
-
-    if (node.tag == "reasoning") {
-        result.reasoning_content += std::string(node.text);
-        return;
-    }
-
-    if (node.tag == "content") {
-        result.content += std::string(node.text);
-        return;
-    }
-
-    if (node.tag == "tool") {
-        auto name_id = arena.find_by_tag(node, "tool-name");
-        auto args_id = arena.find_by_tag(node, "tool-args");
-
-        if (name_id != COMMON_PEG_INVALID_AST_ID && args_id != COMMON_PEG_INVALID_AST_ID) {
-            const auto & name_node = arena.get(name_id);
-            const auto & args_node = arena.get(args_id);
-
-            if (!name_node.is_partial) {
-                common_chat_tool_call call;
-                call.name = std::string(name_node.text);
-                if (!args_node.children.empty()) {
-                    call.arguments = gemma4_to_json(arena, args_node.children[0]);
-                }
-                result.tool_calls.push_back(call);
-            }
-        }
-
-        return;
-    }
-
-    for (auto child_id : node.children) {
-        visit(arena, child_id);
-    }
-}
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@@ -17,9 +17,7 @@ class common_chat_peg_mapper {

    virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
    virtual void map(const common_peg_ast_node & node);
-  protected:
-    virtual std::string normalize_container_value(const std::string & input);
-  private:
+    private:
      // Tool call handling state
      std::optional<common_chat_tool_call> pending_tool_call;  // Tool call waiting for name
      common_chat_tool_call *              current_tool          = nullptr;
@@ -32,14 +30,6 @@ class common_chat_peg_mapper {
      std::string & args_target();
 };

-class common_chat_peg_gemma4_mapper : public common_chat_peg_mapper {
-  public:
-    common_chat_peg_gemma4_mapper(common_chat_msg & msg) : common_chat_peg_mapper(msg) {}
-    virtual void from_ast(const common_peg_ast_arena & arena, const common_peg_parse_result & result);
-  private:
-    void visit(const common_peg_ast_arena & arena, common_peg_ast_id id);
-};
-
 struct content_structure;
 struct tool_call_structure;

@@ -90,14 +80,7 @@ class common_chat_peg_builder : public common_peg_parser_builder {

    // Use for schema-declared string types - won't be treated as potential JSON container
    common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
-    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_VALUE, p); }
-
-
-    // Return a parser that parses the prefix of a string, up to a given delimiter.
-    common_peg_parser prefix(const std::string & s, const std::string & delimiter = {});
-
-    // Return a parser that parses all elements of tag, but leading and trailing spaces are optional
-    common_peg_parser optspace(const std::string & tag);
+    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_VALUE, p)); }

    // Legacy-compatible helper for building standard JSON tool calls
    // Used by tests and manual parsers
--- a/common/chat.cpp
+++ b/common/chat.cpp
--- a/common/chat.h
+++ b/common/chat.h
@@ -3,12 +3,12 @@
 #pragma once

 #include "common.h"
-#include "peg-parser.h"
 #include "jinja/parser.h"
+#include "nlohmann/json_fwd.hpp"
+#include "peg-parser.h"
 #include "jinja/runtime.h"
 #include "jinja/caps.h"
-
-#include "nlohmann/json_fwd.hpp"
+#include "nlohmann/json.hpp"

 #include <chrono>
 #include <functional>
@@ -19,10 +19,12 @@
 using chat_template_caps = jinja::caps;
 using json = nlohmann::ordered_json;

+#include <nlohmann/json_fwd.hpp>
+
 struct common_chat_templates;

 namespace autoparser {
-struct generation_params;
+struct templates_params;
 }  // namespace autoparser

 struct common_chat_tool_call {
@@ -73,9 +75,41 @@ struct common_chat_template {
    const std::string & bos_token() const { return bos_tok; }
    const std::string & eos_token() const { return eos_tok; }

+    // Returns a copy of `messages` (asserted to be a JSON array) with `system_prompt` applied.
+    // If caps.supports_system_role is set, the prompt becomes the leading "system" message;
+    // otherwise it is prepended to the first message's content (or added as a "user" message).
+    // TODO: this is ugly, refactor it somehow
+    json add_system(const json & messages, const std::string & system_prompt) const {
+        GGML_ASSERT(messages.is_array());
+        auto msgs_copy = messages;
+        if (!caps.supports_system_role) {
+            if (msgs_copy.empty()) {
+                msgs_copy.insert(msgs_copy.begin(), json{{"role", "user"}, {"content", system_prompt}});
+            } else {
+                auto & first_msg = msgs_copy[0];
+                // absent or null content counts as empty; get<std::string>() throws on null
+                if (!first_msg.contains("content") || first_msg["content"].is_null()) {
+                    first_msg["content"] = "";
+                }
+                // NOTE(review): non-string, non-null content (e.g. an array) still throws here
+                first_msg["content"] = system_prompt + "\n\n"
+                    + first_msg["content"].get<std::string>();
+            }
+        } else {
+            // overwrite an existing leading system message, otherwise insert a new one
+            if (msgs_copy.empty() || msgs_copy[0].at("role") != "system") {
+                msgs_copy.insert(msgs_copy.begin(), json{{"role", "system"}, {"content", system_prompt}});
+            } else {
+                msgs_copy[0]["content"] = system_prompt;
+            }
+        }
+        return msgs_copy;
+    }
+
    chat_template_caps original_caps() const {
        return caps;
    }
+
 };

 struct common_chat_msg {
@@ -89,22 +123,11 @@ struct common_chat_msg {

    nlohmann::ordered_json to_json_oaicompat(bool concat_typed_text = false) const;

-    std::string render_content(const std::string & delimiter = "\n\n") const;
-
    bool empty() const {
        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() &&
               tool_name.empty() && tool_call_id.empty();
    }

-    bool contains_media() const {
-        for (const auto & part : content_parts) {
-            if (part.type == "media_marker") {
-                return true;
-            }
-        }
-        return false;
-    }
-
    void set_tool_call_ids(std::vector<std::string> &           ids_cache,
                           const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
@@ -143,17 +166,6 @@ struct common_chat_msg_diff {
    }
 };

-struct common_chat_msg_span {
-    std::string role;
-    std::size_t pos = 0;
-    std::size_t len = 0;
-};
-
-struct common_chat_msg_delimiter {
-    std::string role;
-    std::string delimiter;
-};
-
 struct common_chat_tool {
    std::string name;
    std::string description;
@@ -172,27 +184,16 @@ enum common_chat_format {
    // These are intended to be parsed by the PEG parser
    COMMON_CHAT_FORMAT_PEG_SIMPLE,
    COMMON_CHAT_FORMAT_PEG_NATIVE,
-    COMMON_CHAT_FORMAT_PEG_GEMMA4,

    COMMON_CHAT_FORMAT_COUNT,  // Not a format, just the # formats
 };

-
-// Continuation method provided via `continue_final_message`
-enum common_chat_continuation {
-    COMMON_CHAT_CONTINUATION_NONE,
-    COMMON_CHAT_CONTINUATION_AUTO,
-    COMMON_CHAT_CONTINUATION_REASONING,
-    COMMON_CHAT_CONTINUATION_CONTENT,
-};
-
 struct common_chat_templates_inputs {
    std::vector<common_chat_msg>          messages;
    std::string                           grammar;
    std::string                           json_schema;
-    bool                                  add_generation_prompt  = true;
-    common_chat_continuation              continue_final_message = COMMON_CHAT_CONTINUATION_NONE;
-    bool                                  use_jinja              = true;
+    bool                                  add_generation_prompt = true;
+    bool                                  use_jinja             = true;
    // Parameters below only supported when use_jinja is true
    std::vector<common_chat_tool>         tools;
    common_chat_tool_choice               tool_choice         = COMMON_CHAT_TOOL_CHOICE_AUTO;
@@ -203,7 +204,6 @@ struct common_chat_templates_inputs {
    std::map<std::string, std::string>    chat_template_kwargs;
    bool                                  add_bos = false;
    bool                                  add_eos = false;
-    bool                                  force_pure_content = false;
 };

 struct common_chat_params {
@@ -211,7 +211,7 @@ struct common_chat_params {
    std::string                         prompt;
    std::string                         grammar;
    bool                                grammar_lazy         = false;
-    std::string                         generation_prompt;
+    bool                                thinking_forced_open = false;
    bool                                supports_thinking    = false;
    std::string                         thinking_start_tag;  // e.g., "<think>"
    std::string                         thinking_end_tag;    // e.g., "</think>"
@@ -219,7 +219,6 @@ struct common_chat_params {
    std::vector<std::string>            preserved_tokens;
    std::vector<std::string>            additional_stops;
    std::string                         parser;
-    std::vector<common_chat_msg_span>   message_spans;
 };

 // per-message parsing syntax
@@ -229,16 +228,14 @@ struct common_chat_parser_params {
    common_reasoning_format reasoning_format     = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
    bool                    reasoning_in_content = false;
-    std::string             generation_prompt;
+    bool                    thinking_forced_open = false;
    bool                    parse_tool_calls     = true;
-    bool                    is_continuation      = false;
-    bool                    echo                 = false;  // Include assistant prefilled msg in output
    bool                    debug                = false;  // Enable debug output for PEG parser
    common_peg_arena        parser               = {};
    common_chat_parser_params() = default;
    common_chat_parser_params(const common_chat_params & chat_params) {
-        format  = chat_params.format;
-        generation_prompt = chat_params.generation_prompt;
+        format               = chat_params.format;
+        thinking_forced_open = chat_params.thinking_forced_open;
    }
 };

@@ -258,8 +255,8 @@ common_chat_templates_ptr common_chat_templates_init(const struct llama_model *
                                                     const std::string &        bos_token_override = "",
                                                     const std::string &        eos_token_override = "");

-bool        common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
-std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
+bool         common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
+std::string  common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");

 struct common_chat_params common_chat_templates_apply(const struct common_chat_templates *        tmpls,
                                                      const struct common_chat_templates_inputs & inputs);
@@ -276,9 +273,9 @@ std::string common_chat_format_example(const struct common_chat_templates *
                                       bool                                       use_jinja,
                                       const std::map<std::string, std::string> & chat_template_kwargs);

-const char *    common_chat_format_name(common_chat_format format);
-common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & params);
-common_chat_msg common_chat_peg_parse(const common_peg_arena & src_parser, const std::string & input, bool is_partial, const common_chat_parser_params & params);
+const char *            common_chat_format_name(common_chat_format format);
+common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & params);
+common_chat_msg           common_chat_peg_parse(const common_peg_arena & src_parser, const std::string & input, bool is_partial, const common_chat_parser_params & params);

 // used by arg and server
 const char *            common_reasoning_format_name(common_reasoning_format format);
@@ -291,39 +288,20 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates *
 // Parses a JSON array of messages in OpenAI's chat completion API format.
 std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);

-std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
-
-common_chat_continuation common_chat_continuation_parse(const nlohmann::ordered_json & value);
-
 // DEPRECATED: only used in tests
 nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);

+std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
 nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);

+nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
+
 // get template caps, useful for reporting to server /props endpoint
 std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);

 std::string common_chat_template_direct_apply(
    const common_chat_template & tmpl,
-    const autoparser::generation_params & inputs);
-
-std::string common_chat_template_generation_prompt(
-    const common_chat_template &          tmpl,
-    const autoparser::generation_params & inputs);
-
-std::optional<common_chat_params> common_chat_try_specialized_template(
-        const common_chat_template &          tmpl,
-        const std::string &                   src,
-        autoparser::generation_params & params);
-
-
-// specialized per-task preset
-struct common_chat_prompt_preset {
-    std::string system;
-    std::string user;
-};
-
-common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
-
-std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims);
-
+    const autoparser::templates_params & inputs,
+    const std::optional<json> & messages_override = std::nullopt,
+    const std::optional<json> & tools_override = std::nullopt,
+    const std::optional<json> & additional_context = std::nullopt);
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,13 +1,10 @@
 #include "ggml.h"
 #include "gguf.h"

-#include "build-info.h"
 #include "common.h"
-#include "fit.h"
 #include "log.h"
 #include "llama.h"
 #include "sampling.h"
-#include "speculative.h"
 #include "unicode.h"

 #include <algorithm>
@@ -71,7 +68,7 @@ common_time_meas::~common_time_meas() {
 // CPU utils
 //

-int32_t common_cpu_get_num_physical_cores() {
+int32_t cpu_get_num_physical_cores() {
 #ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
@@ -186,11 +183,11 @@ static int cpu_count_math_cpus(int n_cpu) {
 /**
 * Returns number of CPUs on system that are useful for math.
 */
-int32_t common_cpu_get_num_math() {
+int32_t cpu_get_num_math() {
 #if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
    int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
    if (n_cpu < 1) {
-        return common_cpu_get_num_physical_cores();
+        return cpu_get_num_physical_cores();
    }
    if (is_hybrid_cpu()) {
        cpu_set_t affinity;
@@ -203,7 +200,7 @@ int32_t common_cpu_get_num_math() {
        }
    }
 #endif
-    return common_cpu_get_num_physical_cores();
+    return cpu_get_num_physical_cores();
 }

 // Helper for setting process priority
@@ -264,7 +261,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
 //


-void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model) {
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
    int32_t n_set = 0;

    if (cpuparams.n_threads < 0) {
@@ -272,7 +269,7 @@ void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_para
        if (role_model != nullptr) {
            cpuparams = *role_model;
        } else {
-            cpuparams.n_threads = common_cpu_get_num_math();
+            cpuparams.n_threads = cpu_get_num_math();
        }
    }

@@ -362,38 +359,15 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
 }

 void common_init() {
-#if defined(_WIN32)
-    SetConsoleOutputCP(CP_UTF8);
-    SetConsoleCP(CP_UTF8);
-#endif
-
-    common_log_set_prefix(common_log_main(), true);
-    common_log_set_timestamps(common_log_main(), true);
-
    llama_log_set(common_log_default_callback, NULL);
-}

-void common_params_print_info(const common_params & params, bool print_devices) {
 #ifdef NDEBUG
    const char * build_type = "";
 #else
    const char * build_type = " (debug)";
 #endif
-    LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);

-    LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold());
-
-    // device enumeration creates a primary context on CUDA backends, skip it when the caller does not own any device
-    if (print_devices) {
-        LOG_INF("device_info:\n");
-        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-            auto * dev = ggml_backend_dev_get(i);
-            size_t free, total;
-            ggml_backend_dev_memory(dev, &free, &total);
-            LOG_INF("  - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
-        }
-    }
-    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+    LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
 }

 std::string common_params_get_system_info(const common_params & params) {
@@ -445,27 +419,6 @@ std::string string_strip(const std::string & str) {
    return str.substr(start, end - start);
 }

-std::string string_lcs(std::string_view a, std::string_view b) {
-    if (a.empty() || b.empty()) return {};
-
-    std::vector<std::vector<size_t>> dp(a.size() + 1, std::vector<size_t>(b.size() + 1, 0));
-    size_t best_len = 0;
-    size_t best_end_a = 0;
-
-    for (size_t i = 1; i <= a.size(); ++i) {
-        for (size_t j = 1; j <= b.size(); ++j) {
-            if (a[i - 1] == b[j - 1]) {
-                dp[i][j] = dp[i - 1][j - 1] + 1;
-                if (dp[i][j] > best_len) {
-                    best_len = dp[i][j];
-                    best_end_a = i;
-                }
-            }
-        }
-    }
-    return std::string(a.substr(best_end_a - best_len, best_len));
-}
-
 std::string string_get_sortable_timestamp() {
    using clock = std::chrono::system_clock;

@@ -703,97 +656,6 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
    return true;
 }

-static inline bool glob_class_match(const char c, const char * pattern, const char * class_end) {
-    const char * class_start = pattern;
-    bool negated = false;
-
-    if (*class_start == '!') {
-        negated = true;
-        class_start++;
-    }
-
-    // If first character after negation is ']' or '-', treat it as literal
-    if (*class_start == ']' || *class_start == '-') {
-        if (class_start < class_end && *class_start == c) {
-            return !negated;
-        }
-        class_start++;
-    }
-
-    bool matched = false;
-
-    while (class_start < class_end) {
-        if (class_start + 2 < class_end && class_start[1] == '-' && class_start[2] != ']') {
-            char start_char = *class_start;
-            char end_char = class_start[2];
-            if (c >= start_char && c <= end_char) {
-                matched = true;
-                break;
-            }
-            class_start += 3;
-        } else {
-            if (*class_start == c) {
-                matched = true;
-                break;
-            }
-            class_start++;
-        }
-    }
-
-    return negated ? !matched : matched;
-}
-
-// simple glob: * matches non-/ chars, ** matches anything including /, [] matches character class
-static inline bool glob_match(const char * pattern, const char * str) {
-    if (*pattern == '\0') {
-        return *str == '\0';
-    }
-    if (pattern[0] == '*' && pattern[1] == '*') {
-        const char * p = pattern + 2;
-        if (glob_match(p, str)) return true;
-        if (*str != '\0') return glob_match(pattern, str + 1);
-        return false;
-    }
-    if (*pattern == '*') {
-        const char * p = pattern + 1;
-        for (; *str != '\0' && *str != '/'; str++) {
-            if (glob_match(p, str)) return true;
-        }
-        return glob_match(p, str);
-    }
-    if (*pattern == '?' && *str != '\0' && *str != '/') {
-        return glob_match(pattern + 1, str + 1);
-    }
-    if (*pattern == '[') {
-        const char * class_end = pattern + 1;
-        // If first character after '[' is ']' or '-', treat it as literal
-        if (*class_end == ']' || *class_end == '-') {
-            class_end++;
-        }
-        while (*class_end != '\0' && *class_end != ']') {
-            class_end++;
-        }
-        if (*class_end == ']') {
-            if (*str == '\0') return false;
-            bool matched = glob_class_match(*str, pattern + 1, class_end);
-            return matched && glob_match(class_end + 1, str + 1);
-        } else {
-            if (*str == '[') {
-                return glob_match(pattern + 1, str + 1);
-            }
-            return false;
-        }
-    }
-    if (*pattern == *str) {
-        return glob_match(pattern + 1, str + 1);
-    }
-    return false;
-}
-
-bool glob_match(const std::string & pattern, const std::string & str) {
-    return glob_match(pattern.c_str(), str.c_str());
-}
-
 //
 // Filesystem utils
 //
@@ -1181,20 +1043,19 @@ struct common_init_result::impl {
    std::vector<llama_sampler_seq_config> samplers_seq_config;
 };

-common_init_result::common_init_result(common_params & params, bool model_only) :
+common_init_result::common_init_result(common_params & params) :
    pimpl(new impl{}) {
    auto mparams = common_model_params_to_llama(params);
    auto cparams = common_context_params_to_llama(params);

    if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory ...\n", __func__);
-        LOG_INF("%s: (for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)\n", __func__);
-        common_fit_params(params.model.path.c_str(), &mparams, &cparams,
+        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
+        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
            params.tensor_split,
            params.tensor_buft_overrides.data(),
            params.fit_params_target.data(),
            params.fit_params_min_ctx,
-            params.verbosity >= LOG_LEVEL_DEBUG ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
    }

    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
@@ -1204,13 +1065,9 @@ common_init_result::common_init_result(common_params & params, bool model_only)

    pimpl->model.reset(model);

-    if (model_only) {
-        return;
-    }
-
    const llama_vocab * vocab = llama_model_get_vocab(model);

-    // load and optionally apply lora adapters
+    // load and optionally apply lora adapters (must be loaded before context creation)
    for (auto & la : params.lora_adapters) {
        llama_adapter_lora_ptr lora;
        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
@@ -1241,7 +1098,7 @@ common_init_result::common_init_result(common_params & params, bool model_only)
    // initialize once
    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
        if (llama_vocab_is_eog(vocab, i)) {
-            LOG_TRC("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
        }
    }
@@ -1254,12 +1111,12 @@ common_init_result::common_init_result(common_params & params, bool model_only)
    }

    //if (params.sampling.penalty_last_n == -1) {
-    //    LOG_TRC("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
    //}

    //if (params.sampling.dry_penalty_last_n == -1) {
-    //    LOG_TRC("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
    //}

@@ -1295,9 +1152,6 @@ llama_context * common_init_result::context() {
 }

 common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
-    if (seq_id < 0 || seq_id >= (int) pimpl->samplers.size()) {
-        return nullptr;
-    }
    return pimpl->samplers[seq_id].get();
 }

@@ -1311,8 +1165,8 @@ std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
    return pimpl->lora;
 }

-common_init_result_ptr common_init_from_params(common_params & params, bool model_only) {
-    common_init_result_ptr res(new common_init_result(params, model_only));
+common_init_result_ptr common_init_from_params(common_params & params) {
+    common_init_result_ptr res(new common_init_result(params));

    llama_model * model = res->model();
    if (model == NULL) {
@@ -1320,10 +1174,6 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
        return res;
    }

-    if (model_only) {
-        return res;
-    }
-
    llama_context * lctx = res->context();
    if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
@@ -1387,7 +1237,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode
    }

    if (params.warmup) {
-        LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
+        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

        llama_set_warmup(lctx, true);

@@ -1432,7 +1282,7 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode

 common_init_result::~common_init_result() = default;

-std::string common_get_model_endpoint() {
+std::string get_model_endpoint() {
    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
@@ -1447,65 +1297,6 @@ std::string common_get_model_endpoint() {
    return model_endpoint;
 }

-common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
-    auto * mem = llama_get_memory(ctx);
-    if (mem == nullptr) {
-        return COMMON_CONTEXT_SEQ_RM_TYPE_NO;
-    }
-
-    common_context_seq_rm_type res = COMMON_CONTEXT_SEQ_RM_TYPE_PART;
-
-    llama_memory_clear(mem, true);
-
-    // eval 2 tokens to check if the context is compatible
-    std::vector<llama_token> tmp;
-    tmp.push_back(0);
-    tmp.push_back(0);
-
-    int ret = llama_decode(ctx, llama_batch_get_one(tmp.data(), tmp.size()));
-    if (ret != 0) {
-        LOG_ERR("%s: llama_decode() failed: %d\n", __func__, ret);
-        res = COMMON_CONTEXT_SEQ_RM_TYPE_NO;
-        goto done;
-    }
-
-    if (llama_n_rs_seq(ctx) > 0) {
-        LOG_INF("%s: the context supports bounded partial sequence removal\n", __func__);
-        res = COMMON_CONTEXT_SEQ_RM_TYPE_RS;
-        goto done;
-    }
-
-    // try to remove the last tokens
-    if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
-        LOG_TRC("%s: the context does not support partial sequence removal\n", __func__);
-        res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
-        goto done;
-    }
-
-done:
-    llama_memory_clear(mem, true);
-    llama_synchronize(ctx);
-
-    return res;
-}
-
-void common_context_seq_rm(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
-    auto * mem = llama_get_memory(ctx);
-    if (!llama_memory_seq_rm(mem, seq_id, p0, p1)) {
-        GGML_ABORT("%s", string_format("failed to remove sequence %d with p0=%d, p1=%d\n", seq_id, p0, p1).c_str());
-    }
-}
-
-void common_context_seq_cp(llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
-    auto * mem = llama_get_memory(ctx);
-    llama_memory_seq_cp(mem, seq_id_src, seq_id_dst, p0, p1);
-}
-
-void common_context_seq_add(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
-    auto * mem = llama_get_memory(ctx);
-    llama_memory_seq_add(mem, seq_id, p0, p1, delta);
-}
-
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
    std::vector<llama_adapter_lora *> loras;
    std::vector<float> scales;
@@ -1552,7 +1343,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {

    mparams.progress_callback           = params.load_progress_callback;
    mparams.progress_callback_user_data = params.load_progress_callback_user_data;
-    mparams.no_alloc                    = params.no_alloc;

    return mparams;
 }
@@ -1562,7 +1352,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &

    cparams.n_ctx             = params.n_ctx;
    cparams.n_seq_max         = params.n_parallel;
-    cparams.n_rs_seq          = params.speculative.need_n_rs_seq();
    cparams.n_batch           = params.n_batch;
    cparams.n_ubatch          = params.n_ubatch;
    cparams.n_threads         = params.cpuparams.n_threads;
@@ -1594,7 +1383,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    return cparams;
 }

-struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params) {
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
    struct ggml_threadpool_params tpp;

    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
@@ -2033,110 +1822,3 @@ bool common_prompt_batch_decode(

    return true;
 }
-
-size_t common_prompt_checkpoint::size() const {
-    return data_tgt.size() + data_dft.size();
-}
-
-bool common_prompt_checkpoint::empty() const {
-    return data_tgt.empty();
-}
-
-void common_prompt_checkpoint::clear() {
-    n_tokens = 0;
-
-    pos_min = 0;
-    pos_max = 0;
-
-    data_tgt.clear();
-    data_dft.clear();
-}
-
-void common_prompt_checkpoint::update_pos(
-        int64_t n_tokens,
-        llama_pos pos_min,
-        llama_pos pos_max) {
-    this->n_tokens = n_tokens;
-    this->pos_min  = pos_min;
-    this->pos_max  = pos_max;
-}
-
-void common_prompt_checkpoint::update_tgt(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_state_seq_flags flags) {
-    if (ctx == nullptr) {
-        return;
-    }
-
-    const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);
-
-    data_tgt.resize(ckpt_size);
-
-    const size_t n = llama_state_seq_get_data_ext(ctx, data_tgt.data(), ckpt_size, seq_id, flags);
-    if (n != ckpt_size) {
-        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
-    }
-}
-
-void common_prompt_checkpoint::update_dft(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_state_seq_flags flags) {
-    if (ctx == nullptr) {
-        return;
-    }
-
-    const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);
-
-    data_dft.resize(ckpt_size);
-
-    const size_t n = llama_state_seq_get_data_ext(ctx, data_dft.data(), ckpt_size, seq_id, flags);
-    if (n != ckpt_size) {
-        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
-    }
-}
-
-void common_prompt_checkpoint::load_tgt(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_state_seq_flags flags) const {
-    if (ctx == nullptr) {
-        return;
-    }
-
-    if (data_tgt.empty()) {
-        return;
-    }
-
-    const size_t n = llama_state_seq_set_data_ext(ctx, data_tgt.data(), data_tgt.size(), seq_id, flags);
-    if (n != data_tgt.size()) {
-        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_tgt.size(), n);
-    }
-}
-
-void common_prompt_checkpoint::load_dft(
-        llama_context * ctx,
-        llama_seq_id seq_id,
-        llama_state_seq_flags flags) const {
-    if (ctx == nullptr) {
-        return;
-    }
-
-    if (data_dft.empty()) {
-        return;
-    }
-
-    const size_t n = llama_state_seq_set_data_ext(ctx, data_dft.data(), data_dft.size(), seq_id, flags);
-    if (n != data_dft.size()) {
-        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_dft.size(), n);
-    }
-}
-
-void common_prompt_checkpoint::clear_tgt() {
-    data_tgt.clear();
-}
-
-void common_prompt_checkpoint::clear_dft() {
-    data_dft.clear();
-}
--- a/common/common.h
+++ b/common/common.h
@@ -2,10 +2,8 @@

 #pragma once

-#include "llama-cpp.h"
-
 #include "ggml-opt.h"
-#include "ggml.h"
+#include "llama-cpp.h"

 #include <set>
 #include <sstream>
@@ -13,7 +11,6 @@
 #include <string_view>
 #include <vector>
 #include <map>
-#include <algorithm>

 #if defined(_WIN32) && !defined(_WIN32_WINNT)
 #define _WIN32_WINNT 0x0A00
@@ -28,6 +25,11 @@
 #define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

+#define print_build_info() do {                                                                     \
+    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);      \
+    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
+} while(0)
+
 struct common_time_meas {
    common_time_meas(int64_t & t_acc, bool disable = false);
    ~common_time_meas();
@@ -49,13 +51,21 @@ struct common_adapter_lora_info {

 using llama_tokens = std::vector<llama_token>;

+// build info
+extern int LLAMA_BUILD_NUMBER;
+extern const char * LLAMA_COMMIT;
+extern const char * LLAMA_COMPILER;
+extern const char * LLAMA_BUILD_TARGET;
+
+const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
+
 struct common_control_vector_load_info;

 //
 // CPU utils
 //

-struct common_cpu_params {
+struct cpu_params {
    int      n_threads                   = -1;
    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
    bool     mask_valid                  = false;   // Default: any CPU
@@ -64,8 +74,8 @@ struct common_cpu_params {
    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
 };

-int32_t common_cpu_get_num_physical_cores();
-int32_t common_cpu_get_num_math();
+int32_t cpu_get_num_physical_cores();
+int32_t cpu_get_num_math();

 //
 // Common params
@@ -158,10 +168,9 @@ enum common_params_sampling_config : uint64_t {

 enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE,          // no speculative decoding
-    COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE,  // standalone draft model speculative decoding
-    COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3,  // Eagle3 speculative decoding
-    COMMON_SPECULATIVE_TYPE_DRAFT_MTP,     // Multi-token prediction
-    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding based on n-grams
+    COMMON_SPECULATIVE_TYPE_DRAFT,         // draft model
+    COMMON_SPECULATIVE_TYPE_EAGLE3,        // eagle draft model
+    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
@@ -169,43 +178,6 @@ enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_COUNT          // number of types, unknown type
 };

-// Grammar type enumeration
-enum common_grammar_type {
-    COMMON_GRAMMAR_TYPE_NONE,           // no grammar set
-    COMMON_GRAMMAR_TYPE_USER,           // user-provided GBNF (--grammar / "grammar" API field)
-    COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT,  // auto-generated from JSON schema (--json-schema / "json_schema" API field)
-    COMMON_GRAMMAR_TYPE_TOOL_CALLS,     // auto-generated by chat template parser for function calling
-};
-
-// Grammar variant struct with type and grammar string
-struct common_grammar {
-    common_grammar_type type = COMMON_GRAMMAR_TYPE_NONE;
-    std::string grammar;
-
-    // Default constructor - no grammar
-    common_grammar() = default;
-
-    // Constructor with type and grammar string
-    common_grammar(common_grammar_type t, std::string g) : type(t), grammar(std::move(g)) {
-        GGML_ASSERT(type != COMMON_GRAMMAR_TYPE_NONE || !grammar.empty());
-    }
-
-    // Check if a grammar is set
-    bool empty() const { return type == COMMON_GRAMMAR_TYPE_NONE || grammar.empty(); }
-};
-
-// Returns the raw grammar string, or empty string if no grammar is set.
-inline const std::string & common_grammar_value(const common_grammar & g) {
-    return g.grammar;
-}
-
-// Returns true when the generation_prompt should be prefilled into the grammar sampler.
-// Only output-format and tool-call grammars need prefill; user-supplied grammars must not be prefilled.
-inline bool common_grammar_needs_prefill(const common_grammar & g) {
-    return g.type == COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT
-        || g.type == COMMON_GRAMMAR_TYPE_TOOL_CALLS;
-}
-
 // sampling parameters
 struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -256,7 +228,7 @@ struct common_params_sampling {
        COMMON_SAMPLER_TYPE_TEMPERATURE,
    };

-    common_grammar              grammar;      // optional grammar constraint (user / output-format / tool-calls)
+    std::string                         grammar; // optional BNF-like grammar to constrain sampling
    bool                                grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
    std::set<llama_token>               preserved_tokens;
@@ -264,19 +236,13 @@ struct common_params_sampling {
    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

-    // The assistant generation prompt already prefilled into the prompt.
-    // Fed to the grammar sampler (to advance past pre-existing tokens) and used
-    // to determine the reasoning budget sampler's initial state.
-    // Only applied when the grammar is of output-format or tool-calls type.
-    std::string generation_prompt;
-
    // reasoning budget sampler parameters
    // these are populated by the server/CLI based on chat template params
    int32_t                  reasoning_budget_tokens   = -1;   // -1 = disabled, >= 0 = token budget
+    bool                     reasoning_budget_activate_immediately = false;
    std::vector<llama_token> reasoning_budget_start;           // start tag token sequence
    std::vector<llama_token> reasoning_budget_end;             // end tag token sequence
    std::vector<llama_token> reasoning_budget_forced;          // forced sequence (message + end tag)
-    std::string              reasoning_budget_message;         // message injected before end tag when budget exhausted

    bool backend_sampling = false;

@@ -297,84 +263,62 @@ struct common_params_model {
    std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
 };

-// draft-model-based speculative decoding parameters
-struct common_params_speculative_draft {
-    int32_t n_max = 3; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
+struct common_ngram_mod;

-    float p_split = 0.1f; // speculative decoding split probability
-    float p_min   = 0.0f; // minimum speculative decoding probability (greedy)
+struct common_params_speculative {
+    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding

-    bool backend_sampling = true; // offload draft sampling to the backend (default: on)
+    // general-purpose speculative decoding parameters

-    common_params_model mparams;
+    int32_t n_max   = 16; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min   = 0; // minimum number of draft tokens to use for speculative decoding
+    float   p_split = 0.1f; // speculative decoding split probability
+    float   p_min   = 0.75f; // minimum speculative decoding probability (greedy)

-    llama_context * ctx_tgt = nullptr;
-    llama_context * ctx_dft = nullptr;
+    // ngram-based speculative decoding

+    uint16_t ngram_size_n     = 12; // ngram size for lookup
+    uint16_t ngram_size_m     = 48; // mgram size for speculative tokens
+    uint16_t ngram_min_hits   =  1; // minimum hits at ngram/mgram lookup for mgram to be proposed
+
+    std::shared_ptr<common_ngram_mod> ngram_mod;
+
+    std::string lookup_cache_static;  // path of static ngram cache file for lookup decoding           // NOLINT
+    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding          // NOLINT
+
+    // draft-model speculative decoding
+
+    struct common_params_model mparams_dft;
+
+    llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
+
+    llama_context_params cparams_dft; // these are the parameters for the draft llama_context
+
+    int32_t n_ctx        = 0;  // draft context size
    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)

    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

-    common_cpu_params cpuparams;
-    common_cpu_params cpuparams_batch;
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;

    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

+    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
-};
-
-struct common_params_speculative_ngram_mod {
-    int32_t n_match = 24;
-
-    int32_t n_max = 64;
-    int32_t n_min = 48;
-};
-
-struct common_params_speculative_ngram_map {
-    uint16_t size_n   = 12; // ngram size for lookup
-    uint16_t size_m   = 48; // mgram size for speculative tokens
-    uint16_t min_hits = 1;  // minimum hits at ngram/mgram lookup for mgram to be proposed
-};
-
-struct common_params_speculative_ngram_cache {
-    std::string lookup_cache_static;  // path of static ngram cache file for lookup decoding
-    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding
-};
-
-struct common_params_speculative {
-    std::vector<enum common_speculative_type> types = { COMMON_SPECULATIVE_TYPE_NONE };
-
-    // used by Simple, MTP, Eagle3, etc. - all methods that require some kind of draft model
-    common_params_speculative_draft draft;
-
-    common_params_speculative_ngram_mod ngram_mod;
-    common_params_speculative_ngram_map ngram_simple;
-    common_params_speculative_ngram_map ngram_map_k;
-    common_params_speculative_ngram_map ngram_map_k4v;
-
-    common_params_speculative_ngram_cache ngram_cache;

    bool has_dft() const {
-        return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
-    }
-
-    uint32_t need_n_rs_seq() const {
-        bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
-            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP;
-        });
-
-        return needs_rs_seq ? draft.n_max : 0u;
+        return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
    }
 };

 struct common_params_vocoder {
    struct common_params_model model;

-    std::string speaker_file; // speaker file path
+    std::string speaker_file = ""; // speaker file path                                      // NOLINT

-    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy
+    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
 };

 struct common_params_diffusion {
@@ -445,20 +389,19 @@ struct common_params {
    // offload params
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

-    int32_t n_gpu_layers       = -1;    // number of layers to store in VRAM, -1 is auto, <= -2 is all
-    int32_t main_gpu           = 0;     // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]  = {0};   // how split tensors should be distributed across GPUs
-    bool    fit_params         = true;  // whether to fit unset model/context parameters to free device memory
-    bool    fit_params_print   = false; // print the estimated required memory to run the model
-    int32_t fit_params_min_ctx = 4096;  // minimum context size to set when trying to reduce memory use
+    int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM, -1 is auto, <= -2 is all
+    int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
+    bool    fit_params         = true; // whether to fit unset model/context parameters to free device memory
+    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use

    // margin per device in bytes for fitting parameters to free memory:
    std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

-    common_cpu_params cpuparams;
-    common_cpu_params cpuparams_batch;
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;
@@ -586,15 +529,13 @@ struct common_params {

    // server params
    int32_t port                = 8080;          // server listens on this network port
-    bool    reuse_port          = false;         // allow multiple sockets to bind to the same port
    int32_t timeout_read        = 600;           // http read timeout in seconds
    int32_t timeout_write       = timeout_read;  // http write timeout in seconds
    int32_t n_threads_http      = -1;    // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse       = 0;     // min chunk size to reuse from the cache via KV shifting
    bool    cache_prompt        = true;  // whether to enable prompt caching
-    bool    cache_idle_slots    = true;  // save and clear idle slots upon starting a new task
-    int32_t n_ctx_checkpoints   = 32;    // max number of context checkpoints per slot
-    int32_t checkpoint_min_step = 256;   // minimum spacing between context checkpoints
+    int32_t n_ctx_checkpoints   = 32;     // max number of context checkpoints per slot
+    int32_t checkpoint_every_nt = 8192;   // make a checkpoint every n tokens during prefill
    int32_t cache_ram_mib       = 8192;  // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

    std::string hostname      = "127.0.0.1";
@@ -603,9 +544,10 @@ struct common_params {
    std::string chat_template = "";                                                                         // NOLINT
    bool use_jinja = true;                                                                                  // NOLINT
    bool enable_chat_template = true;
-    bool force_pure_content_parser = false;
    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
+    int reasoning_budget = -1;
+    std::string reasoning_budget_message; // message injected before end tag when budget exhausted
    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
    int sleep_idle_seconds = -1;   // if >0, server will sleep after this many seconds of idle time

@@ -616,25 +558,16 @@ struct common_params {

    std::map<std::string, std::string> default_template_kwargs;

-    // UI configs
-    bool ui = true;
-
-    // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
-    bool webui = ui;
+    // webui configs
+    bool webui = true;
    bool webui_mcp_proxy = false;
    std::string webui_config_json;

-    bool ui_mcp_proxy = false;
-    std::string ui_config_json;
-
    // "advanced" endpoints are disabled by default for better security
    bool endpoint_slots   = true;
    bool endpoint_props   = false; // only control POST requests, not GET
    bool endpoint_metrics = false;

-    // enable built-in tools
-    std::vector<std::string> server_tools;
-
    // router server configs
    std::string models_dir    = ""; // directory containing models for the router server
    std::string models_preset = ""; // directory containing model presets for the router server
@@ -697,19 +630,17 @@ struct common_params {
    // return false from callback to abort model loading or true to continue
    llama_progress_callback load_progress_callback = NULL;
    void *                  load_progress_callback_user_data = NULL;
-    bool no_alloc = false; // Don't allocate model buffers
 };

 // call once at the start of a program if it uses libcommon
 // initializes the logging system and prints info about the build
 void common_init();

-void common_params_print_info(const common_params & params, bool print_devices = true);
 std::string common_params_get_system_info(const common_params & params);

 bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
 bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model = nullptr);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
 bool set_process_priority(enum ggml_sched_priority prio);

 //
@@ -731,7 +662,6 @@ std::string string_format(const char * fmt, ...);

 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
-std::string string_lcs(std::string_view a, std::string_view b);

 std::string string_join(const std::vector<std::string> & values, const std::string & separator);
 std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
@@ -778,11 +708,6 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) {
           str.compare(0, prefix.size(), prefix) == 0;
 }

-// remove when moving to c++20
-inline bool string_starts_with(std::string_view str, char prefix) {
-    return !str.empty() && str.front() == prefix;
-}
-
 // remove when moving to c++20
 inline bool string_ends_with(std::string_view str, std::string_view suffix) {
    return str.size() >= suffix.size() &&
@@ -820,8 +745,6 @@ std::string string_from(const std::vector<int> & values);
 std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
 std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);

-bool glob_match(const std::string & pattern, const std::string & str);
-
 //
 // Filesystem utils
 //
@@ -856,7 +779,7 @@ struct common_sampler;

 // note: defines the model, context, samplers, ets. lifetimes
 struct common_init_result {
-    common_init_result(common_params & params, bool model_only = false);
+    common_init_result(common_params & params);
    ~common_init_result();

    llama_model * model();
@@ -874,37 +797,16 @@ private:

 using common_init_result_ptr = std::unique_ptr<common_init_result>;

-common_init_result_ptr common_init_from_params(common_params & params, bool model_only = false);
+common_init_result_ptr common_init_from_params(common_params & params);

 struct llama_model_params     common_model_params_to_llama  (      common_params & params);
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
-struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

-// model endpoint from env
-std::string common_get_model_endpoint();
-
-//
-// Context utils
-//
-
-enum common_context_seq_rm_type {
-    COMMON_CONTEXT_SEQ_RM_TYPE_NO           = 0, // seq_rm not supported (e.g. no memory module)
-    COMMON_CONTEXT_SEQ_RM_TYPE_PART         = 1, // can seq_rm partial sequences
-    COMMON_CONTEXT_SEQ_RM_TYPE_FULL         = 2, // can seq_rm full sequences only
-    COMMON_CONTEXT_SEQ_RM_TYPE_RS = 3, // can seq_rm partial sequences, bounded by n_rs_seq
-};
-
-// check if the llama_context can remove sequences
-// note: clears the memory of the context
-common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx);
-
-// aborts execution on failure
-void common_context_seq_rm (llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1);
-void common_context_seq_add(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta);
-void common_context_seq_cp (llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1);
+std::string                   get_model_endpoint();

 //
 // Batch utils
@@ -1043,50 +945,3 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std

 // "adamw" or "sgd" (case insensitive)
 enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
-
-//
-// prompt utils
-//
-
-struct common_prompt_checkpoint {
-    int64_t n_tokens;
-
-    llama_pos pos_min;
-    llama_pos pos_max;
-
-    std::vector<uint8_t> data_tgt;
-    std::vector<uint8_t> data_dft;
-
-    size_t size() const;
-
-    bool empty() const;
-    void clear();
-
-    void update_pos(
-            int64_t n_tokens,
-            llama_pos pos_min,
-            llama_pos pos_max);
-
-    void update_tgt(
-            llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_state_seq_flags flags);
-
-    void update_dft(
-            llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_state_seq_flags flags);
-
-    void load_tgt(
-            llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_state_seq_flags flags) const;
-
-    void load_dft(
-            llama_context * ctx,
-            llama_seq_id seq_id,
-            llama_state_seq_flags flags) const;
-
-    void clear_tgt();
-    void clear_dft();
-};
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -700,13 +700,13 @@ namespace console {
        std::vector<std::string> entries;
        size_t viewing_idx = SIZE_MAX;
        std::string backup_line; // current line before viewing history
-        void add(std::string_view line) {
+        void add(const std::string & line) {
            if (line.empty()) {
                return;
            }
            // avoid duplicates with the last entry
            if (entries.empty() || entries.back() != line) {
-                entries.emplace_back(line);
+                entries.push_back(line);
            }
            // also clear viewing state
            end_viewing();
@@ -1031,12 +1031,11 @@ namespace console {

        if (!end_of_stream && !line.empty()) {
            // remove the trailing newline for history storage
-            std::string_view hline = line;
            if (!line.empty() && line.back() == '\n') {
-                hline.remove_suffix(1);
+                line.pop_back();
            }
            // TODO: maybe support multiline history entries?
-            history.add(hline);
+            history.add(line);
        }

        fflush(out);
--- a/common/debug.cpp
+++ b/common/debug.cpp
@@ -1,38 +1,9 @@
 #include "debug.h"

-#include "common.h"
 #include "log.h"

 #include <cmath>
-#include <regex>
 #include <string>
-#include <vector>
-
-struct common_debug_cb_user_data::impl {
-    std::vector<uint8_t>    data;
-    std::vector<std::regex> tensor_filters;
-    bool                    abort_on_nan{false};
-};
-
-common_debug_cb_user_data::common_debug_cb_user_data() : pimpl(std::make_unique<impl>()) {}
-common_debug_cb_user_data::~common_debug_cb_user_data() = default;
-
-common_debug_cb_user_data::common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan)
-    : pimpl(std::make_unique<impl>())
-{
-    for (const auto & pattern : filter_patterns) {
-        try {
-            std::string anchored_pattern = "^" + pattern;
-            pimpl->tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
-        } catch (const std::regex_error & e) {
-            throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
-        }
-    }
-    pimpl->abort_on_nan = abort_on_nan;
-
-    params.cb_eval           = common_debug_cb_eval;
-    params.cb_eval_user_data = this;
-}

 static std::string common_ggml_ne_string(const ggml_tensor * t) {
    std::string str;
@@ -76,7 +47,8 @@ static float common_ggml_get_float_value(const uint8_t * data,

 #define INDENT "    "

-static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n, bool abort_on_nan) {
+template <bool abort>
+void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
    GGML_ASSERT(n > 0);
    float sum = 0;
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
@@ -122,7 +94,7 @@ static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int6
        LOG(INDENT "sum = %f\n", sum);
    }

-    if (abort_on_nan) {
+    if constexpr (abort) {
        if (std::isnan(sum)) {
            LOG("encountered NaN - aborting\n");
            exit(0);
@@ -140,9 +112,8 @@ static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int6
 * @param user_data user data to pass at each call back
 * @return true to receive data or continue the graph, false otherwise
 */
-bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
-    auto * cb_data = (common_debug_cb_user_data *) user_data;
-    auto * pimpl = cb_data->pimpl.get();
+template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (base_callback_data *) user_data;

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];
@@ -151,10 +122,10 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
        return true;  // Always retrieve data
    }

-    bool matches_filter = pimpl->tensor_filters.empty();
+    bool matches_filter = cb_data->tensor_filters.empty();

    if (!matches_filter) {
-        for (const auto & filter : pimpl->tensor_filters) {
+        for (const auto & filter : cb_data->tensor_filters) {
            if (std::regex_search(t->name, filter)) {
                matches_filter = true;
                break;
@@ -177,14 +148,20 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {

    if (!is_host) {
        auto n_bytes = ggml_nbytes(t);
-        pimpl->data.resize(n_bytes);
-        ggml_backend_tensor_get(t, pimpl->data.data(), 0, n_bytes);
+        cb_data->data.resize(n_bytes);
+        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
    }

    if (!ggml_is_quantized(t->type) && matches_filter) {
-        uint8_t * data = is_host ? (uint8_t *) t->data : pimpl->data.data();
-        common_debug_print_tensor(data, t->type, t->ne, t->nb, 3, pimpl->abort_on_nan);
+        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+        common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
    }

    return true;
 }
+
+// Explicit template instantiations
+template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
+template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
+template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
+template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
--- a/common/debug.h
+++ b/common/debug.h
@@ -1,31 +1,43 @@
 #pragma once
-
-#include <memory>
+#include "common.h"
 #include <string>
 #include <vector>
+#include <regex>

 // common debug functions and structs

-struct common_params;
+// Print a tensor's detailed data
+// data - the tensor's data in byte format
+// type - the tensor's quantization type
+// ne   - the tensor dimensions array
+// nb   - the tensor strides array
+// n    - the number of rows/columns to fully print
+template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);

 // Intended to use as callback for ggml_backend_sched_eval_callback
 // prints tensors that are processed in the computation graph
-// by default prints all tensors, but can be configured by creating a `common_debug_cb_user_data` instance with
-// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
-// `common_debug_cb_user_data` contains `abort_on_nan` flag that determines whether an error should be thrown whenever a NaN is encountered
+// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
+// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
+// The template parameter determines whether an error should be thrown whenever a NaN is encountered
 // in a tensor (useful for stopping debug sessions on first erroneous tensor)
 // The callback data will be passed as the third parameter (user_data)
-bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
+template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
+struct base_callback_data {
+    std::vector<uint8_t>    data;
+    std::vector<std::regex> tensor_filters;

-struct common_debug_cb_user_data {
-    struct impl;
-    std::unique_ptr<impl> pimpl;
+    base_callback_data() = default;

-    common_debug_cb_user_data();
-    ~common_debug_cb_user_data();
-
-    common_debug_cb_user_data(const common_debug_cb_user_data &) = delete;
-    common_debug_cb_user_data & operator=(const common_debug_cb_user_data &) = delete;
-
-    common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan = false);
+    base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
+        for (const auto & pattern : filter_patterns) {
+            try {
+                std::string anchored_pattern = "^" + pattern;
+                tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
+            } catch (const std::regex_error & e) {
+                throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
+            }
+        }
+        params.cb_eval           = common_debug_cb_eval<false>;
+        params.cb_eval_user_data = this;
+    }
 };
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -1,10 +1,9 @@
 #include "arg.h"

-#include "build-info.h"
 #include "common.h"
+#include "gguf.h" // for reading GGUF splits
 #include "log.h"
 #include "download.h"
-#include "hf-cache.h"

 #define JSON_ASSERT GGML_ASSERT
 #include <nlohmann/json.hpp>
@@ -16,7 +15,6 @@
 #include <map>
 #include <mutex>
 #include <regex>
-#include <unordered_set>
 #include <string>
 #include <thread>
 #include <vector>
@@ -37,6 +35,8 @@
 #endif
 #endif

+#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
 // isatty
 #if defined(_WIN32)
 #include <io.h>
@@ -51,6 +51,31 @@ using json = nlohmann::ordered_json;
 //

 // validate repo name format: owner/repo
+static bool validate_repo_name(const std::string & repo) {
+    static const std::regex repo_regex(R"(^[A-Za-z0-9_.\-]+\/[A-Za-z0-9_.\-]+$)");
+    return std::regex_match(repo, repo_regex);
+}
+
+static std::string get_manifest_path(const std::string & repo, const std::string & tag) {
+    // we use "=" to avoid clashing with other components, while still being allowed on Windows
+    std::string fname = "manifest=" + repo + "=" + tag + ".json";
+    if (!validate_repo_name(repo)) {
+        throw std::runtime_error("error: repo name must be in the format 'owner/repo'");
+    }
+    string_replace_all(fname, "/", "=");
+    return fs_get_cache_file(fname);
+}
+
+static std::string read_file(const std::string & fname) {
+    std::ifstream file(fname);
+    if (!file) {
+        throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
+    }
+    std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+    file.close();
+    return content;
+}
+
 static void write_file(const std::string & fname, const std::string & content) {
    const std::string fname_tmp = fname + ".tmp";
    std::ofstream     file(fname_tmp);
@@ -107,7 +132,7 @@ static bool is_http_status_ok(int status) {

 std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag) {
    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
-    std::string tag = parts.size() > 1 ? parts.back() : "";
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
    std::string hf_repo = parts[0];
    if (string_split<std::string>(hf_repo, '/').size() != 2) {
        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
@@ -115,14 +140,11 @@ std::pair<std::string, std::string> common_download_split_repo_tag(const std::st
    return {hf_repo, tag};
 }

-class ProgressBar : public common_download_callback {
+class ProgressBar {
    static inline std::mutex mutex;
    static inline std::map<const ProgressBar *, int> lines;
    static inline int max_line = 0;

-    std::string filename;
-    size_t len = 0;
-
    static void cleanup(const ProgressBar * line) {
        lines.erase(line);
        if (lines.empty()) {
@@ -141,33 +163,17 @@ class ProgressBar : public common_download_callback {
 public:
    ProgressBar() = default;

-    void on_start(const common_download_progress & p) override {
-        filename = p.url;
-
-        if (auto pos = filename.rfind('/'); pos != std::string::npos) {
-            filename = filename.substr(pos + 1);
-        }
-        if (auto pos = filename.find('?'); pos != std::string::npos) {
-            filename = filename.substr(0, pos);
-        }
-        for (size_t i = 0; i < filename.size(); ++i) {
-            if ((filename[i] & 0xC0) != 0x80) {
-                if (len++ == 39) {
-                    filename.resize(i);
-                    filename += "…";
-                    break;
-                }
-            }
-        }
-    }
-
-    void on_done(const common_download_progress &, bool) override {
+    ~ProgressBar() {
        std::lock_guard<std::mutex> lock(mutex);
        cleanup(this);
    }

-    void on_update(const common_download_progress & p) override {
-        if (!p.total || !is_output_a_tty()) {
+    void update(size_t current, size_t total) {
+        if (!is_output_a_tty()) {
+            return;
+        }
+
+        if (!total) {
            return;
        }

@@ -179,27 +185,28 @@ public:
        }
        int lines_up = max_line - lines[this];

-        size_t bar = (55 - len) * 2;
-        size_t pct = (100 * p.downloaded) / p.total;
-        size_t pos = (bar * p.downloaded) / p.total;
+        size_t width = 50;
+        size_t pct = (100 * current) / total;
+        size_t pos = (width * current) / total;
+
+        std::cout << "\033[s";

        if (lines_up > 0) {
            std::cout << "\033[" << lines_up << "A";
        }
-        std::cout << '\r' << "Downloading " << filename << " ";
+        std::cout << "\033[2K\r["
+            << std::string(pos, '=')
+            << (pos < width ? ">" : "")
+            << std::string(width - pos, ' ')
+            << "] " << std::setw(3) << pct << "%  ("
+            << current / (1024 * 1024) << " MB / "
+            << total / (1024 * 1024) << " MB) "
+            << "\033[u";

-        for (size_t i = 0; i < bar; i += 2) {
-            std::cout << (i + 1 < pos ? "─" : (i < pos ? "╴" : " "));
-        }
-        std::cout << std::setw(4) << pct << "%\033[K";
+        std::cout.flush();

-        if (lines_up > 0) {
-            std::cout << "\033[" << lines_up << "B";
-        }
-        std::cout << '\r' << std::flush;
-
-        if (p.downloaded == p.total) {
-            cleanup(this);
+        if (current == total) {
+             cleanup(this);
        }
    }

@@ -211,8 +218,8 @@ static bool common_pull_file(httplib::Client & cli,
                             const std::string & resolve_path,
                             const std::string & path_tmp,
                             bool supports_ranges,
-                             common_download_progress & p,
-                             common_download_callback * callback) {
+                             size_t existing_size,
+                             size_t & total_size) {
    std::ofstream ofs(path_tmp, std::ios::binary | std::ios::app);
    if (!ofs.is_open()) {
        LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_tmp.c_str());
@@ -220,27 +227,29 @@ static bool common_pull_file(httplib::Client & cli,
    }

    httplib::Headers headers;
-    if (supports_ranges && p.downloaded > 0) {
-        headers.emplace("Range", "bytes=" + std::to_string(p.downloaded) + "-");
+    if (supports_ranges && existing_size > 0) {
+        headers.emplace("Range", "bytes=" + std::to_string(existing_size) + "-");
    }

    const char * func = __func__; // avoid __func__ inside a lambda
+    size_t downloaded = existing_size;
    size_t progress_step = 0;
+    ProgressBar bar;

    auto res = cli.Get(resolve_path, headers,
        [&](const httplib::Response &response) {
-            if (p.downloaded > 0 && response.status != 206) {
+            if (existing_size > 0 && response.status != 206) {
                LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", func, response.status);
                return false;
            }
-            if (p.downloaded == 0 && response.status != 200) {
+            if (existing_size == 0 && response.status != 200) {
                LOG_WRN("%s: download received non-successful status code: %d\n", func, response.status);
                return false;
            }
-            if (p.total == 0 && response.has_header("Content-Length")) {
+            if (total_size == 0 && response.has_header("Content-Length")) {
                try {
                    size_t content_length = std::stoull(response.get_header_value("Content-Length"));
-                    p.total = p.downloaded + content_length;
+                    total_size = existing_size + content_length;
                } catch (const std::exception &e) {
                    LOG_WRN("%s: invalid Content-Length header: %s\n", func, e.what());
                }
@@ -253,16 +262,11 @@ static bool common_pull_file(httplib::Client & cli,
                LOG_ERR("%s: error writing to file: %s\n", func, path_tmp.c_str());
                return false;
            }
-            p.downloaded += len;
+            downloaded += len;
            progress_step += len;

-            if (progress_step >= p.total / 1000 || p.downloaded == p.total) {
-                if (callback) {
-                    callback->on_update(p);
-                    if (callback->is_cancelled()) {
-                        return false;
-                    }
-                }
+            if (progress_step >= total_size / 1000 || downloaded == total_size) {
+                bar.update(downloaded, total_size);
                progress_step = 0;
            }
            return true;
@@ -283,46 +287,41 @@ static bool common_pull_file(httplib::Client & cli,

 // download one single file from remote URL to local path
 // returns status code or -1 on error
-static int common_download_file_single_online(const std::string & url,
-                                              const std::string & path,
-                                              const common_download_opts & opts,
-                                              bool skip_etag) {
+static int common_download_file_single_online(const std::string        & url,
+                                              const std::string        & path,
+                                              const std::string        & bearer_token,
+                                              const common_header_list & custom_headers) {
    static const int max_attempts        = 3;
    static const int retry_delay_seconds = 2;

-    const bool file_exists = std::filesystem::exists(path);
-
-    if (file_exists && skip_etag) {
-        LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str());
-        return 304; // 304 Not Modified - fake cached response
-    }
-
    auto [cli, parts] = common_http_client(url);

    httplib::Headers headers;
-    for (const auto & h : opts.headers) {
+    for (const auto & h : custom_headers) {
        headers.emplace(h.first, h.second);
    }
    if (headers.find("User-Agent") == headers.end()) {
-        headers.emplace("User-Agent", "llama-cpp/" + std::string(llama_build_info()));
+        headers.emplace("User-Agent", "llama-cpp/" + build_info);
    }
-    if (!opts.bearer_token.empty()) {
-        headers.emplace("Authorization", "Bearer " + opts.bearer_token);
+    if (!bearer_token.empty()) {
+        headers.emplace("Authorization", "Bearer " + bearer_token);
    }
    cli.set_default_headers(headers);

+    const bool file_exists = std::filesystem::exists(path);
+
    std::string last_etag;
    if (file_exists) {
        last_etag = read_etag(path);
    } else {
-        LOG_DBG("%s: no previous model file found %s\n", __func__, path.c_str());
+        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
    }

    auto head = cli.Head(parts.path);
    if (!head || head->status < 200 || head->status >= 300) {
-        LOG_TRC("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
+        LOG_WRN("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
        if (file_exists) {
-            LOG_TRC("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
+            LOG_INF("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
            return 304; // 304 Not Modified - fake cached response
        }
        return head ? head->status : -1;
@@ -333,11 +332,10 @@ static int common_download_file_single_online(const std::string & url,
        etag = head->get_header_value("ETag");
    }

-    common_download_progress p;
-    p.url = url;
+    size_t total_size = 0;
    if (head->has_header("Content-Length")) {
        try {
-            p.total = std::stoull(head->get_header_value("Content-Length"));
+            total_size = std::stoull(head->get_header_value("Content-Length"));
        } catch (const std::exception& e) {
            LOG_WRN("%s: invalid Content-Length in HEAD response: %s\n", __func__, e.what());
        }
@@ -350,11 +348,11 @@ static int common_download_file_single_online(const std::string & url,

    if (file_exists) {
        if (etag.empty()) {
-            LOG_DBG("%s: using cached file (no server etag): %s\n", __func__, path.c_str());
+            LOG_INF("%s: using cached file (no server etag): %s\n", __func__, path.c_str());
            return 304; // 304 Not Modified - fake cached response
        }
        if (!last_etag.empty() && last_etag == etag) {
-            LOG_DBG("%s: using cached file (same etag): %s\n", __func__, path.c_str());
+            LOG_INF("%s: using cached file (same etag): %s\n", __func__, path.c_str());
            return 304; // 304 Not Modified - fake cached response
        }
        if (remove(path.c_str()) != 0) {
@@ -363,23 +361,10 @@ static int common_download_file_single_online(const std::string & url,
        }
    }

-    { // silent
-        std::error_code ec;
-        std::filesystem::create_directories(std::filesystem::path(path).parent_path(), ec);
-    }
-
-    bool success = false;
    const std::string path_temporary = path + ".downloadInProgress";
    int delay = retry_delay_seconds;

-    if (opts.callback) {
-        opts.callback->on_start(p);
-    }
-
    for (int i = 0; i < max_attempts; ++i) {
-        if (opts.callback && opts.callback->is_cancelled()) {
-            break;
-        }
        if (i) {
            LOG_WRN("%s: retrying after %d seconds...\n", __func__, delay);
            std::this_thread::sleep_for(std::chrono::seconds(delay));
@@ -393,44 +378,28 @@ static int common_download_file_single_online(const std::string & url,
                existing_size = std::filesystem::file_size(path_temporary);
            } else if (remove(path_temporary.c_str()) != 0) {
                LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
-                break;
+                return -1;
            }
        }

-        p.downloaded = existing_size;
-
-        LOG_DBG("%s: downloading from %s to %s (etag:%s)...\n",
+        LOG_INF("%s: downloading from %s to %s (etag:%s)...\n",
                __func__, common_http_show_masked_url(parts).c_str(),
                path_temporary.c_str(), etag.c_str());

-        if (common_pull_file(cli, parts.path, path_temporary, supports_ranges, p, opts.callback)) {
+        if (common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size)) {
            if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
                LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-                break;
+                return -1;
            }
-            if (!etag.empty() && !skip_etag) {
+            if (!etag.empty()) {
                write_etag(path, etag);
            }
-            success = true;
-            break;
+            return head->status;
        }
    }

-    if (opts.callback) {
-        opts.callback->on_done(p, success);
-    }
-    if (opts.callback && opts.callback->is_cancelled() &&
-        std::filesystem::exists(path_temporary)) {
-        if (remove(path_temporary.c_str()) != 0) {
-            LOG_ERR("%s: unable to delete temporary file: %s\n", __func__, path_temporary.c_str());
-        }
-    }
-    if (!success) {
-        LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
-        return -1; // max attempts reached
-    }
-
-    return head->status;
+    LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
+    return -1; // max attempts reached
 }

 std::pair<long, std::vector<char>> common_remote_get_content(const std::string          & url,
@@ -442,7 +411,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
        headers.emplace(h.first, h.second);
    }
    if (headers.find("User-Agent") == headers.end()) {
-        headers.emplace("User-Agent", "llama-cpp/" + std::string(llama_build_info()));
+        headers.emplace("User-Agent", "llama-cpp/" + build_info);
    }

    if (params.timeout > 0) {
@@ -469,15 +438,11 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string

 int common_download_file_single(const std::string & url,
                                const std::string & path,
-                                const common_download_opts & opts,
-                                bool skip_etag) {
-    if (!opts.offline) {
-        ProgressBar tty_cb;
-        common_download_opts online_opts = opts;
-        if (!online_opts.callback) {
-            online_opts.callback = &tty_cb;
-        }
-        return common_download_file_single_online(url, path, online_opts, skip_etag);
+                                const std::string & bearer_token,
+                                bool offline,
+                                const common_header_list & headers) {
+    if (!offline) {
+        return common_download_file_single_online(url, path, bearer_token, headers);
    }

    if (!std::filesystem::exists(path)) {
@@ -485,361 +450,197 @@ int common_download_file_single(const std::string & url,
        return -1;
    }

-    LOG_DBG("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
-
-    // notify the callback that the file was cached
-    if (opts.callback) {
-        common_download_progress p;
-        p.url = url;
-        p.cached = true;
-        opts.callback->on_start(p);
-        opts.callback->on_done(p, true);
-    }
-
+    LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
    return 304; // Not Modified - fake cached response
 }

-struct gguf_split_info {
-    std::string prefix; // tag included
-    std::string tag;
-    int index;
-    int count;
-};
+// download multiple files from remote URLs to local paths
+// the input is a vector of pairs <url, path>
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls,
+                                          const std::string & bearer_token,
+                                          bool offline,
+                                          const common_header_list & headers) {
+    // Launch one asynchronous download task per <url, path> pair
+    std::vector<std::future<bool>> futures_download;
+    futures_download.reserve(urls.size());

-static gguf_split_info get_gguf_split_info(const std::string & path) {
-    static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase);
-    static const std::regex re_tag("[-.]([A-Z0-9_]+)$", std::regex::icase);
-    std::smatch m;
-
-    std::string prefix = path;
-    if (!string_remove_suffix(prefix, ".gguf")) {
-        return {};
+    for (auto const & item : urls) {
+        futures_download.push_back(
+            std::async(
+                std::launch::async,
+                [&bearer_token, offline, &headers](const std::pair<std::string, std::string> & it) -> bool {
+                    const int http_status = common_download_file_single(it.first, it.second, bearer_token, offline, headers);
+                    return is_http_status_ok(http_status);
+                },
+                item
+            )
+        );
    }

-    int index = 1;
-    int count = 1;
-
-    if (std::regex_match(prefix, m, re_split)) {
-        index = std::stoi(m[2].str());
-        count = std::stoi(m[3].str());
-        prefix = m[1].str();
-    }
-
-    std::string tag;
-    if (std::regex_search(prefix, m, re_tag)) {
-        tag = m[1].str();
-        for (char & c : tag) {
-            c = std::toupper((unsigned char)c);
+    // Wait for all downloads to complete
+    for (auto & f : futures_download) {
+        if (!f.get()) {
+            return false;
        }
    }

-    return {std::move(prefix), std::move(tag), index, count};
+    return true;
 }

-// Q4_0 -> 4, F16 -> 16, NVFP4 -> 4, Q8_K_M -> 8, etc
-static int extract_quant_bits(const std::string & filename) {
-    auto split = get_gguf_split_info(filename);
-
-    auto pos = split.tag.find_first_of("0123456789");
-    if (pos == std::string::npos) {
-        return 0;
-    }
-
-    return std::stoi(split.tag.substr(pos));
-}
-
-static hf_cache::hf_files get_split_files(const hf_cache::hf_files & files,
-                                          const hf_cache::hf_file  & file) {
-    auto split = get_gguf_split_info(file.path);
-
-    if (split.count <= 1) {
-        return {file};
-    }
-    hf_cache::hf_files result;
-
-    for (const auto & f : files) {
-        auto split_f = get_gguf_split_info(f.path);
-        if (split_f.count == split.count && split_f.prefix == split.prefix) {
-            result.push_back(f);
-        }
-    }
-    return result;
-}
-
-// pick the best sibling GGUF whose filename contains `keyword` (e.g. "mmproj" / "mtp"),
-// preferring deeper shared directory prefix with the model, then closest quantization
-static hf_cache::hf_file find_best_sibling(const hf_cache::hf_files & files,
-                                           const std::string        & model,
-                                           const std::string        & keyword) {
-    hf_cache::hf_file best;
-    size_t best_depth = 0;
-    int best_diff = 0;
-    bool found = false;
-
-    auto model_bits = extract_quant_bits(model);
-    auto model_parts = string_split<std::string>(model, '/');
-    auto model_dir = model_parts.end() - 1;
-
-    for (const auto & f : files) {
-        if (!string_ends_with(f.path, ".gguf") ||
-            f.path.find(keyword) == std::string::npos) {
-            continue;
-        }
-
-        auto sib_parts = string_split<std::string>(f.path, '/');
-        auto sib_dir = sib_parts.end() - 1;
-
-        auto [_, dir] = std::mismatch(model_parts.begin(), model_dir,
-                                      sib_parts.begin(), sib_dir);
-        if (dir != sib_dir) {
-            continue;
-        }
-
-        size_t depth = dir - sib_parts.begin();
-        auto bits = extract_quant_bits(f.path);
-        auto diff = std::abs(bits - model_bits);
-
-        if (!found || depth > best_depth || (depth == best_depth && diff < best_diff)) {
-            best = f;
-            best_depth = depth;
-            best_diff = diff;
-            found = true;
-        }
-    }
-    return best;
-}
-
-static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
-                                          const std::string        & model) {
-    return find_best_sibling(files, model, "mmproj");
-}
-
-static hf_cache::hf_file find_best_mtp(const hf_cache::hf_files & files,
-                                       const std::string        & model) {
-    return find_best_sibling(files, model, "mtp-");
-}
-
-static bool gguf_filename_is_model(const std::string & filepath) {
-    if (!string_ends_with(filepath, ".gguf")) {
+bool common_download_model(const common_params_model & model,
+                           const std::string & bearer_token,
+                           bool offline,
+                           const common_header_list & headers) {
+    // Basic validation of the model.url
+    if (model.url.empty()) {
+        LOG_ERR("%s: invalid model url\n", __func__);
        return false;
    }

-    std::string filename = filepath;
-    if (auto pos = filename.rfind('/'); pos != std::string::npos) {
-        filename = filename.substr(pos + 1);
+    const int http_status = common_download_file_single(model.url, model.path, bearer_token, offline, headers);
+    if (!is_http_status_ok(http_status)) {
+        return false;
    }

-    return filename.find("mmproj")  == std::string::npos &&
-           filename.find("imatrix") == std::string::npos &&
-           filename.find("mtp-")    == std::string::npos;
-}
+    // check whether the GGUF is split into additional files that must be downloaded
+    int n_split = 0;
+    {
+        struct gguf_init_params gguf_params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ NULL,
+        };
+        auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
+        if (!ctx_gguf) {
+            LOG_ERR("\n%s:  failed to load input GGUF from %s\n", __func__, model.path.c_str());
+            return false;
+        }

-static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
-                                         const std::string        & tag) {
-    std::vector<std::string> tags;
+        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+        if (key_n_split >= 0) {
+            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+        }

-    if (!tag.empty()) {
-        tags.push_back(tag);
-    } else {
-        tags = {"Q4_K_M", "Q8_0"};
+        gguf_free(ctx_gguf);
    }

-    for (const auto & t : tags) {
-        std::regex pattern(t + "[.-]", std::regex::icase);
-        for (const auto & f : files) {
-            if (gguf_filename_is_model(f.path) &&
-                std::regex_search(f.path, pattern)) {
-                auto split = get_gguf_split_info(f.path);
-                if (split.count > 1 && split.index != 1) {
-                    continue;
-                }
-                return f;
+    if (n_split > 1) {
+        char split_prefix[PATH_MAX] = {0};
+        char split_url_prefix[LLAMA_MAX_URL_LENGTH] = {0};
+
+        // Verify the first split file format
+        // and extract split URL and PATH prefixes
+        {
+            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
+                return false;
+            }
+
+            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
+                return false;
            }
        }
-    }

-    // fallback to first available model only if tag is empty
-    if (tag.empty()) {
-        for (const auto & f : files) {
-            if (gguf_filename_is_model(f.path)) {
-                auto split = get_gguf_split_info(f.path);
-                if (split.count > 1 && split.index != 1) {
-                    continue;
-                }
-                return f;
+        std::vector<std::pair<std::string, std::string>> urls;
+        for (int idx = 1; idx < n_split; idx++) {
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+            char split_url[LLAMA_MAX_URL_LENGTH] = {0};
+            llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
+
+            if (std::string(split_path) == model.path) {
+                continue; // skip the already downloaded file
            }
+
+            urls.push_back({split_url, split_path});
        }
+
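+        // Download in parallel - NOTE(review): the bool result is discarded, so a failed split download still ends in `return true`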
+        common_download_file_multiple(urls, bearer_token, offline, headers);
    }

-    return {};
+    return true;
 }

-static void list_available_gguf_files(const hf_cache::hf_files & files) {
-    LOG_INF("Available GGUF files:\n");
-    for (const auto & f : files) {
-        if (string_ends_with(f.path, ".gguf")) {
-            LOG_INF(" - %s\n", f.path.c_str());
+common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag,
+                                      const std::string & bearer_token,
+                                      bool offline,
+                                      const common_header_list & custom_headers) {
+    // the returned hf_repo is without tag
+    auto [hf_repo, tag] = common_download_split_repo_tag(hf_repo_with_tag);
+
+    std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
+
+    // headers
+    common_header_list headers = custom_headers;
+    headers.push_back({"Accept", "application/json"});
+    if (!bearer_token.empty()) {
+        headers.push_back({"Authorization", "Bearer " + bearer_token});
+    }
+    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+    // User-Agent header is already set in common_remote_get_content, no need to set it here
+
+    // make the request
+    common_remote_params params;
+    params.headers = headers;
+    long res_code = 0;
+    std::string res_str;
+    bool use_cache = false;
+    std::string cached_response_path = get_manifest_path(hf_repo, tag);
+    if (!offline) {
+        try {
+            auto res = common_remote_get_content(url, params);
+            res_code = res.first;
+            res_str = std::string(res.second.data(), res.second.size());
+        } catch (const std::exception & e) {
+            LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
        }
    }
-}
-
-struct hf_plan {
-    hf_cache::hf_file primary;
-    hf_cache::hf_files model_files;
-    hf_cache::hf_file mmproj;
-    hf_cache::hf_file mtp;
-};
-
-static hf_plan get_hf_plan(const common_params_model  & model,
-                           const common_download_opts & opts,
-                           bool download_mmproj,
-                           bool download_mtp) {
-    hf_plan plan;
-    hf_cache::hf_files all;
-
-    auto [repo, tag] = common_download_split_repo_tag(model.hf_repo);
-
-    if (!opts.offline) {
-        all = hf_cache::get_repo_files(repo, opts.bearer_token);
-    }
-    if (all.empty()) {
-        all = hf_cache::get_cached_files(repo);
-    }
-    if (all.empty()) {
-        return plan;
+    if (res_code == 0) {
+        if (std::filesystem::exists(cached_response_path)) {
+            LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
+            res_str = read_file(cached_response_path);
+            res_code = 200;
+            use_cache = true;
+        } else {
+            throw std::runtime_error(
+                offline ? "error: failed to get manifest (offline mode)"
+                : "error: failed to get manifest (check your internet connection)");
+        }
    }
+    std::string ggufFile;
+    std::string mmprojFile;

-    hf_cache::hf_file primary;
+    if (res_code == 200 || res_code == 304) {
+        try {
+            auto j = json::parse(res_str);

-    if (!model.hf_file.empty()) {
-        for (const auto & f : all) {
-            if (f.path == model.hf_file) {
-                primary = f;
-                break;
+            if (j.contains("ggufFile") && j["ggufFile"].contains("rfilename")) {
+                ggufFile = j["ggufFile"]["rfilename"].get<std::string>();
            }
-        }
-        if (primary.path.empty()) {
-            LOG_ERR("%s: file '%s' not found in repository\n", __func__, model.hf_file.c_str());
-            list_available_gguf_files(all);
-            return plan;
-        }
-    } else {
-        primary = find_best_model(all, tag);
-        if (primary.path.empty()) {
-            LOG_ERR("%s: no GGUF files found in repository %s\n", __func__, repo.c_str());
-            list_available_gguf_files(all);
-            return plan;
-        }
-    }
-
-    plan.primary = primary;
-    plan.model_files = get_split_files(all, primary);
-
-    if (download_mmproj) {
-        plan.mmproj = find_best_mmproj(all, primary.path);
-    }
-
-    if (download_mtp) {
-        plan.mtp = find_best_mtp(all, primary.path);
-    }
-
-    return plan;
-}
-
-struct download_task {
-    std::string url;
-    std::string path;
-};
-
-static std::vector<download_task> get_url_tasks(const common_params_model & model) {
-    auto split = get_gguf_split_info(model.url);
-
-    if (split.count <= 1) {
-        return {{model.url, model.path}};
-    }
-
-    auto filename = split.prefix;
-    if (auto pos = split.prefix.rfind('/'); pos != std::string::npos) {
-        filename = split.prefix.substr(pos + 1);
-    }
-
-    auto parent_path = std::filesystem::path(model.path).parent_path();
-    auto prefix_path = (parent_path / filename).string();
-
-    std::vector<download_task> tasks;
-    for (int i = 1; i <= split.count; i++) {
-        auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count);
-        tasks.push_back({split.prefix + suffix, prefix_path + suffix});
-    }
-    return tasks;
-}
-
-common_download_model_result common_download_model(const common_params_model  & model,
-                                                   const common_download_opts & opts,
-                                                   bool download_mmproj,
-                                                   bool download_mtp) {
-    common_download_model_result result;
-    std::vector<download_task> tasks;
-    hf_plan hf;
-
-    bool is_hf = !model.hf_repo.empty();
-
-    if (is_hf) {
-        hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
-        for (const auto & f : hf.model_files) {
-            tasks.push_back({f.url, f.local_path});
-        }
-        if (!hf.mmproj.path.empty()) {
-            tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
-        }
-        if (!hf.mtp.path.empty()) {
-            tasks.push_back({hf.mtp.url, hf.mtp.local_path});
-        }
-    } else if (!model.url.empty()) {
-        tasks = get_url_tasks(model);
-    } else {
-        result.model_path = model.path;
-        return result;
-    }
-
-    if (tasks.empty()) {
-        return result;
-    }
-
-    std::vector<std::future<bool>> futures;
-    for (const auto & task : tasks) {
-        futures.push_back(std::async(std::launch::async,
-            [&task, &opts, is_hf]() {
-                int status = common_download_file_single(task.url, task.path, opts, is_hf);
-                return is_http_status_ok(status);
+            if (j.contains("mmprojFile") && j["mmprojFile"].contains("rfilename")) {
+                mmprojFile = j["mmprojFile"]["rfilename"].get<std::string>();
            }
-        ));
-    }
-
-    for (auto & f : futures) {
-        if (!f.get()) {
-            return {};
+        } catch (const std::exception & e) {
+            throw std::runtime_error(std::string("error parsing manifest JSON: ") + e.what());
        }
-    }
-
-    if (is_hf) {
-        for (const auto & f : hf.model_files) {
-            hf_cache::finalize_file(f);
-        }
-        result.model_path = hf.primary.final_path;
-
-        if (!hf.mmproj.path.empty()) {
-            result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
-        }
-
-        if (!hf.mtp.path.empty()) {
-            result.mtp_path = hf_cache::finalize_file(hf.mtp);
+        if (!use_cache) {
+            // if not using cached response, update the cache file
+            write_file(cached_response_path, res_str);
        }
+    } else if (res_code == 401) {
+        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
    } else {
-        result.model_path = model.path;
+        throw std::runtime_error(string_format("error from HF API (%s), response code: %ld, data: %s", url.c_str(), res_code, res_str.c_str()));
    }

-    return result;
+    // check response
+    if (ggufFile.empty()) {
+        throw std::runtime_error("error: model does not have ggufFile");
+    }
+
+    return { hf_repo, ggufFile, mmprojFile };
 }

 //
@@ -950,9 +751,7 @@ std::string common_docker_resolve_model(const std::string & docker) {
        std::string local_path = fs_get_cache_file(model_filename);

        const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
-        common_download_opts opts;
-        opts.bearer_token = token;
-        const int http_status = common_download_file_single(blob_url, local_path, opts);
+        const int http_status = common_download_file_single(blob_url, local_path, token, false, {});
        if (!is_http_status_ok(http_status)) {
            throw std::runtime_error("Failed to download Docker Model");
        }
@@ -966,22 +765,28 @@ std::string common_docker_resolve_model(const std::string & docker) {
 }

 std::vector<common_cached_model_info> common_list_cached_models() {
-    std::unordered_set<std::string> seen;
-    std::vector<common_cached_model_info> result;
-
-    auto files = hf_cache::get_cached_files();
-
-    for (const auto & f : files) {
-        auto split = get_gguf_split_info(f.path);
-        if (split.index != 1 || split.tag.empty() ||
-            split.prefix.find("mmproj") != std::string::npos ||
-            split.prefix.find("mtp-")   != std::string::npos) {
-            continue;
-        }
-        if (seen.insert(f.repo_id + ":" + split.tag).second) {
-            result.push_back({f.repo_id, split.tag});
+    std::vector<common_cached_model_info> models;
+    const std::string cache_dir = fs_get_cache_directory();
+    const std::vector<common_file_info> files = fs_list(cache_dir, false);
+    for (const auto & file : files) {
+        if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) {
+            common_cached_model_info model_info;
+            model_info.manifest_path = file.path;
+            std::string fname = file.name;
+            string_replace_all(fname, ".json", ""); // remove extension
+            auto parts = string_split<std::string>(fname, '=');
+            if (parts.size() == 4) {
+                // expect format: manifest=<user>=<model>=<tag> (exactly 4 '='-separated parts, ".json" already stripped)
+                model_info.user  = parts[1];
+                model_info.model = parts[2];
+                model_info.tag   = parts[3];
+            } else {
+                // invalid format
+                continue;
+            }
+            model_info.size = 0; // TODO: get GGUF size, not manifest size
+            models.push_back(model_info);
        }
    }
-
-    return result;
+    return models;
 }
--- a/common/download.h
+++ b/common/download.h
@@ -8,22 +8,6 @@ struct common_params_model;
 using common_header      = std::pair<std::string, std::string>;
 using common_header_list = std::vector<common_header>;

-struct common_download_progress {
-    std::string url;
-    size_t downloaded = 0;
-    size_t total      = 0;
-    bool cached       = false;
-};
-
-class common_download_callback {
-public:
-    virtual ~common_download_callback() = default;
-    virtual void on_start(const common_download_progress & p) = 0;
-    virtual void on_update(const common_download_progress & p) = 0;
-    virtual void on_done(const common_download_progress & p, bool ok) = 0;
-    virtual bool is_cancelled() const { return false; }
-};
-
 struct common_remote_params {
    common_header_list headers;
    long timeout  = 0;           // in seconds, 0 means no timeout
@@ -33,65 +17,55 @@ struct common_remote_params {
 // get remote file content, returns <http_code, raw_response_body>
 std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);

-// split HF repo with tag into <repo, tag>, for example:
-// - "ggml-org/models:F16" -> <"ggml-org/models", "F16">
-// tag is optional and can be empty
+// split HF repo with tag into <repo, tag>
+// for example: "user/model:tag" -> <"user/model", "tag">
+// if tag is not present, default to "latest"
+// example: "user/model" -> <"user/model", "latest">
 std::pair<std::string, std::string> common_download_split_repo_tag(const std::string & hf_repo_with_tag);

-// Result of common_list_cached_models
 struct common_cached_model_info {
-    std::string repo;
+    std::string manifest_path;
+    std::string user;
+    std::string model;
    std::string tag;
+    size_t      size = 0; // GGUF size in bytes
+    // return string representation like "user/model:tag"
+    // if tag is "latest", it will be omitted
    std::string to_string() const {
-        return repo + ":" + tag;
+        return user + "/" + model + (tag == "latest" ? "" : ":" + tag);
    }
 };

-// Options for common_download_model and common_download_file_single
-struct common_download_opts {
-    std::string bearer_token;
-    common_header_list headers;
-    bool offline = false;
-    common_download_callback * callback = nullptr;
+struct common_hf_file_res {
+    std::string repo; // repo name with ":tag" removed
+    std::string ggufFile;
+    std::string mmprojFile;
 };

-// Result of common_download_model
-struct common_download_model_result {
-    std::string model_path;
-    std::string mmproj_path;
-    std::string mtp_path;
-};
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Returns a common_hf_file_res holding the repo (with the tag already removed), the GGUF file name, and the mmproj file name (empty when the manifest lists none)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+common_hf_file_res common_get_hf_file(
+    const std::string & hf_repo_with_tag,
+    const std::string & bearer_token,
+    bool offline,
+    const common_header_list & headers = {}
+);

-// Download model from HuggingFace repo or URL
-//
-// input (via model struct):
-// - model.hf_repo: HF repo with optional tag, see common_download_split_repo_tag
-// - model.hf_file: specific file in the repo (requires hf_repo)
-// - model.url: simple download (used if hf_repo is empty)
-// - model.path: local file path
-//
-// tag matching (for HF repos without model.hf_file):
-// - if tag is specified, searches for GGUF matching that quantization
-// - if no tag, searches for Q4_K_M, then Q4_0, then first available GGUF
-//
-// split GGUF: multi-part files like "model-00001-of-00003.gguf" are automatically
-// detected and all parts are downloaded
-//
-// caching:
-// - HF repos: uses HuggingFace cache
-// - URLs: uses ETag-based caching
-//
-// when opts.offline=true, no network requests are made
-// when download_mmproj=true, searches for mmproj in same directory as model or any parent directory
-// then with the closest quantization bits
-// when download_mtp=true, applies the same sibling search for an MTP-head GGUF
-//
-// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure)
-common_download_model_result common_download_model(
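+// returns false if the main model file cannot be downloaded or parsed; NOTE(review): failures of additional split downloads are not reflected in the return value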
+bool common_download_model(
    const common_params_model & model,
-    const common_download_opts & opts = {},
-    bool download_mmproj = false,
-    bool download_mtp    = false
+    const std::string & bearer_token,
+    bool offline,
+    const common_header_list & headers = {}
 );

 // returns list of cached models
@@ -99,11 +73,11 @@ std::vector<common_cached_model_info> common_list_cached_models();

 // download single file from url to local path
 // returns status code or -1 on error
-// skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash)
 int common_download_file_single(const std::string & url,
                                const std::string & path,
-                                const common_download_opts & opts = {},
-                                bool skip_etag = false);
+                                const std::string & bearer_token,
+                                bool offline,
+                                const common_header_list & headers = {});

 // resolve and download model from Docker registry
 // return local path to downloaded model file
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
arthw	2985be3324	update hw info	2026-03-31 09:24:40 +08:00
arthw	8dc96153c3	enhance FA stable in UT	2026-03-17 15:57:02 +08:00