arg: Add LLAMA_ARG_API_KEY_FILE environment variable for --api-key-file (#23167 )

test-llama-archs: fix table format [no release] (#23810 )
ggml: auto apply iGPU flag CUDA/HIP if integrated device (#23007 )
2026-05-28 17:27:26 +03:00 · 2026-05-28 16:25:40 +02:00 · 2026-05-28 15:53:54 +02:00 · 2026-05-28 15:01:14 +02:00 · 2026-05-28 14:51:14 +02:00 · 2026-05-28 14:50:25 +02:00
1130 changed files with 44407 additions and 36995 deletions
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -5,6 +5,9 @@
 # Define the CANN base image for easier version updates later
 ARG CHIP_TYPE=910b
 ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.5.0-${CHIP_TYPE}-openeuler24.03-py3.11
+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A

 # ==============================================================================
 # BUILD STAGE
@@ -55,6 +58,7 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full && \
    cp build/bin/* /app/full/ && \
    cp *.py /app/full/ && \
+    cp -r conversion /app/full/ && \
    cp -r gguf-py /app/full/ && \
    cp -r requirements /app/full/ && \
    cp requirements.txt /app/full/
@@ -67,6 +71,19 @@ RUN mkdir -p /app/full && \
 # ==============================================================================
 FROM ${CANN_BASE_IMAGE} AS base

+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
+ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
+LABEL org.opencontainers.image.created=$BUILD_DATE \
+      org.opencontainers.image.version=$APP_VERSION \
+      org.opencontainers.image.revision=$APP_REVISION \
+      org.opencontainers.image.title="llama.cpp" \
+      org.opencontainers.image.description="LLM inference in C/C++" \
+      org.opencontainers.image.url=$IMAGE_URL \
+      org.opencontainers.image.source=$IMAGE_SOURCE
+
 # -- Install runtime dependencies --
 RUN yum install -y libgomp curl && \
    yum clean all && \
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -1,4 +1,7 @@
 ARG UBUNTU_VERSION=24.04
+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A

 FROM ubuntu:$UBUNTU_VERSION AS build

@@ -27,6 +30,7 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
+    && cp -r conversion /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
@@ -35,6 +39,19 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ubuntu:$UBUNTU_VERSION AS base

+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
+ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
+LABEL org.opencontainers.image.created=$BUILD_DATE \
+      org.opencontainers.image.version=$APP_VERSION \
+      org.opencontainers.image.revision=$APP_REVISION \
+      org.opencontainers.image.title="llama.cpp" \
+      org.opencontainers.image.description="LLM inference in C/C++" \
+      org.opencontainers.image.url=$IMAGE_URL \
+      org.opencontainers.image.source=$IMAGE_SOURCE
+
 RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -6,6 +6,10 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER

 ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+
 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

 # CUDA architecture to build for (defaults to all supported archs)
@@ -32,6 +36,7 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
+    && cp -r conversion /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
@@ -40,6 +45,19 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ${BASE_CUDA_RUN_CONTAINER} AS base

+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
+ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
+LABEL org.opencontainers.image.created=$BUILD_DATE \
+      org.opencontainers.image.version=$APP_VERSION \
+      org.opencontainers.image.revision=$APP_REVISION \
+      org.opencontainers.image.title="llama.cpp" \
+      org.opencontainers.image.description="LLM inference in C/C++" \
+      org.opencontainers.image.url=$IMAGE_URL \
+      org.opencontainers.image.source=$IMAGE_SOURCE
+
 RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -1,4 +1,7 @@
 ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04
+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A

 ## Build Image

@@ -33,6 +36,7 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
+    && cp -r conversion /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
@@ -40,6 +44,19 @@ RUN mkdir -p /app/full \

 FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
+ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
+LABEL org.opencontainers.image.created=$BUILD_DATE \
+      org.opencontainers.image.version=$APP_VERSION \
+      org.opencontainers.image.revision=$APP_REVISION \
+      org.opencontainers.image.title="llama.cpp" \
+      org.opencontainers.image.description="LLM inference in C/C++" \
+      org.opencontainers.image.url=$IMAGE_URL \
+      org.opencontainers.image.source=$IMAGE_SOURCE
+
 ARG IGC_VERSION=v2.20.5
 ARG IGC_VERSION_FULL=2_2.20.5+19972
 ARG COMPUTE_RUNTIME_VERSION=25.40.35563.10
--- a/.devops/llama-cli-cann.Dockerfile
+++ b/.devops/llama-cli-cann.Dockerfile
@@ -1,4 +1,7 @@
 ARG ASCEND_VERSION=8.5.0-910b-openeuler22.03-py3.10
+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A

 FROM ascendai/cann:$ASCEND_VERSION AS build

@@ -28,6 +31,20 @@ RUN echo "Building with static libs" && \

 # TODO: use image with NNRT
 FROM ascendai/cann:$ASCEND_VERSION AS runtime
+
+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
+ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
+LABEL org.opencontainers.image.created=$BUILD_DATE \
+      org.opencontainers.image.version=$APP_VERSION \
+      org.opencontainers.image.revision=$APP_REVISION \
+      org.opencontainers.image.title="llama.cpp" \
+      org.opencontainers.image.description="LLM inference in C/C++" \
+      org.opencontainers.image.url=$IMAGE_URL \
+      org.opencontainers.image.source=$IMAGE_SOURCE
+
 COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion /

 ENV LC_ALL=C.utf8
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -6,6 +6,10 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V

 ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64

+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build

 # MUSA architecture to build for (defaults to all supported archs)
@@ -37,6 +41,7 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
+    && cp -r conversion /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
@@ -45,6 +50,19 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ${BASE_MUSA_RUN_CONTAINER} AS base

+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
+ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
+LABEL org.opencontainers.image.created=$BUILD_DATE \
+      org.opencontainers.image.version=$APP_VERSION \
+      org.opencontainers.image.revision=$APP_REVISION \
+      org.opencontainers.image.title="llama.cpp" \
+      org.opencontainers.image.description="LLM inference in C/C++" \
+      org.opencontainers.image.url=$IMAGE_URL \
+      org.opencontainers.image.source=$IMAGE_SOURCE
+
 RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
--- a/.devops/openvino.Dockerfile
+++ b/.devops/openvino.Dockerfile
@@ -18,6 +18,10 @@ ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2
 ARG http_proxy=
 ARG https_proxy=

+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+
 ## Build Image
 FROM ubuntu:${UBUNTU_VERSION} AS build

@@ -77,6 +81,7 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
    && cp build/ReleaseOV/bin/* /app/full/ \
    && cp *.py /app/full \
+    && cp -r conversion /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
@@ -88,6 +93,18 @@ FROM ubuntu:${UBUNTU_VERSION} AS base
 # Pass proxy args to runtime stage
 ARG http_proxy
 ARG https_proxy
+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
+ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
+LABEL org.opencontainers.image.created=$BUILD_DATE \
+      org.opencontainers.image.version=$APP_VERSION \
+      org.opencontainers.image.revision=$APP_REVISION \
+      org.opencontainers.image.title="llama.cpp" \
+      org.opencontainers.image.description="LLM inference in C/C++" \
+      org.opencontainers.image.url=$IMAGE_URL \
+      org.opencontainers.image.source=$IMAGE_SOURCE

 RUN apt-get update \
    && apt-get install -y libgomp1 libtbb12 curl wget ocl-icd-libopencl1 \
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -7,6 +7,10 @@ ARG AMDGPU_VERSION=7.2.1
 # Target the ROCm build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+
 ### Build image
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build

@@ -49,6 +53,7 @@ RUN mkdir -p /app/lib \
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
+    && cp -r conversion /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
@@ -57,6 +62,19 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ${BASE_ROCM_DEV_CONTAINER} AS base

+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
+ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
+LABEL org.opencontainers.image.created=$BUILD_DATE \
+      org.opencontainers.image.version=$APP_VERSION \
+      org.opencontainers.image.revision=$APP_REVISION \
+      org.opencontainers.image.title="llama.cpp" \
+      org.opencontainers.image.description="LLM inference in C/C++" \
+      org.opencontainers.image.url=$IMAGE_URL \
+      org.opencontainers.image.source=$IMAGE_SOURCE
+
 RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -1,5 +1,8 @@
 ARG GCC_VERSION=15.2.0
 ARG UBUNTU_VERSION=24.04
+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A

 ### Build Llama.cpp stage
 FROM gcc:${GCC_VERSION} AS build
@@ -34,6 +37,7 @@ RUN --mount=type=cache,target=/root/.ccache \

 COPY *.py             /opt/llama.cpp/bin
 COPY .devops/tools.sh /opt/llama.cpp/bin
+COPY conversion       /opt/llama.cpp/conversion

 COPY gguf-py          /opt/llama.cpp/gguf-py
 COPY requirements.txt /opt/llama.cpp/gguf-py
@@ -44,14 +48,28 @@ COPY requirements     /opt/llama.cpp/gguf-py/requirements
 FROM scratch AS collector

 # Copy llama.cpp binaries and libraries
-COPY --from=build /opt/llama.cpp/bin     /llama.cpp/bin
-COPY --from=build /opt/llama.cpp/lib     /llama.cpp/lib
-COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
+COPY --from=build /opt/llama.cpp/bin        /llama.cpp/bin
+COPY --from=build /opt/llama.cpp/lib        /llama.cpp/lib
+COPY --from=build /opt/llama.cpp/gguf-py    /llama.cpp/gguf-py
+COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion


 ### Base image
 FROM ubuntu:${UBUNTU_VERSION} AS base

+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
+ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
+LABEL org.opencontainers.image.created=$BUILD_DATE \
+      org.opencontainers.image.version=$APP_VERSION \
+      org.opencontainers.image.revision=$APP_REVISION \
+      org.opencontainers.image.title="llama.cpp" \
+      org.opencontainers.image.description="LLM inference in C/C++" \
+      org.opencontainers.image.url=$IMAGE_URL \
+      org.opencontainers.image.source=$IMAGE_SOURCE
+
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
    apt update -y && \
@@ -91,6 +109,7 @@ RUN curl https://sh.rustup.rs -sSf | bash -s -- -y

 COPY --from=collector /llama.cpp/bin /app
 COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
+COPY --from=collector /llama.cpp/conversion /app/conversion

 RUN pip install --no-cache-dir --break-system-packages \
        -r /app/gguf-py/requirements.txt
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -1,4 +1,7 @@
 ARG UBUNTU_VERSION=26.04
+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A

 FROM ubuntu:$UBUNTU_VERSION AS build

@@ -23,6 +26,7 @@ RUN mkdir -p /app/lib && \
 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
+    && cp -r conversion /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
@@ -31,6 +35,19 @@ RUN mkdir -p /app/full \
 ## Base image
 FROM ubuntu:$UBUNTU_VERSION AS base

+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
+ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
+LABEL org.opencontainers.image.created=$BUILD_DATE \
+      org.opencontainers.image.version=$APP_VERSION \
+      org.opencontainers.image.revision=$APP_REVISION \
+      org.opencontainers.image.title="llama.cpp" \
+      org.opencontainers.image.description="LLM inference in C/C++" \
+      org.opencontainers.image.url=$IMAGE_URL \
+      org.opencontainers.image.source=$IMAGE_SOURCE
+
 RUN apt-get update \
    && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
    libglvnd0 libgl1 libglx0 libegl1 libgles2 \
--- a/.devops/zendnn.Dockerfile
+++ b/.devops/zendnn.Dockerfile
@@ -0,0 +1,101 @@
+ARG UBUNTU_VERSION=24.04
+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+RUN apt-get update && \
+    apt-get install -y gcc-13 g++-13 build-essential git cmake libssl-dev libomp-dev libnuma-dev python3 ca-certificates
+
+ENV CC=gcc-13 CXX=g++-13
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \
+    cmake --build build -j $(nproc)
+
+RUN mkdir -p /app/lib && \
+    find build -name "*.so*" -exec cp -P {} /app/lib \;
+
+RUN mkdir -p /app/full \
+    && cp build/bin/* /app/full \
+    && cp *.py /app/full \
+    && cp -r conversion /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base image
+FROM ubuntu:$UBUNTU_VERSION AS base
+
+ARG BUILD_DATE=N/A
+ARG APP_VERSION=N/A
+ARG APP_REVISION=N/A
+ARG IMAGE_URL=https://github.com/ggml-org/llama.cpp
+ARG IMAGE_SOURCE=https://github.com/ggml-org/llama.cpp
+LABEL org.opencontainers.image.created=$BUILD_DATE \
+      org.opencontainers.image.version=$APP_VERSION \
+      org.opencontainers.image.revision=$APP_REVISION \
+      org.opencontainers.image.title="llama.cpp" \
+      org.opencontainers.image.description="LLM inference in C/C++" \
+      org.opencontainers.image.url=$IMAGE_URL \
+      org.opencontainers.image.source=$IMAGE_SOURCE
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 libnuma1 curl \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app
+
+### Full
+FROM base AS full
+
+COPY --from=build /app/full /app
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y \
+    git \
+    python3 \
+    python3-pip \
+    python3-wheel \
+    && pip install --break-system-packages --upgrade setuptools \
+    && pip install --break-system-packages -r requirements.txt \
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+ENTRYPOINT ["/app/tools.sh"]
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/full/llama-completion /app
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
--- a/.editorconfig
+++ b/.editorconfig
@@ -45,15 +45,7 @@ insert_final_newline = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset

-[tools/server/webui/**]
-indent_style = unset
-indent_size = unset
-end_of_line = unset
-charset = unset
-trim_trailing_whitespace = unset
-insert_final_newline = unset
-
-[tools/server/public/**]
+[tools/ui/**]
 indent_style = unset
 indent_size = unset
 end_of_line = unset
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,4 +0,0 @@
-# Treat the generated single-file WebUI build as binary for diff purposes.
-# Git's pack-file delta compression still works (byte-level), but this prevents
-# git diff from printing the entire minified file on every change.
-tools/server/public/index.html -diff
--- a/.github/ISSUE_TEMPLATE/011-bug-results.yml
+++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml
@@ -100,8 +100,8 @@ body:
      label: Relevant log output
      description: >
          Please copy and paste any relevant log output, including the command that you entered and any generated text.
-          For very long logs (thousands of lines), preferably upload them as files instead.
-          On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
+          For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
+          On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
      value: |
        <details>
        <summary>Logs</summary>
--- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml
+++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml
@@ -88,8 +88,8 @@ body:
      description: >
          If applicable, please copy and paste any relevant log output, including any generated text.
          If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well.
-          For very long logs (thousands of lines), please upload them as files instead.
-          On Linux you can redirect console output into a file by appending ` > llama.log 2>&1` to your command.
+          For very long logs (thousands of lines), please upload them as files instead; the `--log-file` CLI argument can be used for this purpose.
+          On Linux you can alternatively redirect the console output of any command into a file by appending ` > llama.log 2>&1` to your command.
      value: |
        <details>
        <summary>Logs</summary>
--- a/.github/actions/linux-setup-spacemit/action.yml
+++ b/.github/actions/linux-setup-spacemit/action.yml
@@ -15,6 +15,6 @@ runs:
      id: setup
      uses: ./.github/actions/unarchive-tar
      with:
-        url: https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
+        url: https://github.com/spacemit-com/toolchain/releases/download/v${{ inputs.version }}/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
        path: ${{ inputs.path }}
        strip: 1
--- a/.github/actions/unarchive-tar/action.yml
+++ b/.github/actions/unarchive-tar/action.yml
@@ -24,4 +24,4 @@ runs:
      run: |
        mkdir -p ${{ inputs.path }}
        cd ${{ inputs.path }}
-        curl --no-progress-meter ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}
+        curl --no-progress-meter -L ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}
--- a/.github/actions/windows-setup-cuda/action.yml
+++ b/.github/actions/windows-setup-cuda/action.yml
@@ -96,3 +96,34 @@ runs:
          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
          echo "CUDA_PATH_V13_1=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+
+    - name: Install Cuda Toolkit 13.3
+      if: ${{ inputs.cuda_version == '13.3' }}
+      shell: pwsh
+      run: |
+          mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3"
+          choco install unzip -y
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_crt/windows-x86_64/cuda_crt-windows-x86_64-13.3.33-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-13.3.29-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-13.3.33-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-13.3.33-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-13.5.1.27-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libnvvm/windows-x86_64/libnvvm-windows-x86_64-13.3.33-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-13.3.29-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-13.3.27-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-13.3.27-archive.zip"
+          curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cccl/windows-x86_64/cccl-windows-x86_64-13.3.3.3.1-archive.zip"
+          unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3"
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_crt-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_cudart-windows-x86_64-13.3.29-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvcc-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvrtc-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\libcublas-windows-x86_64-13.5.1.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\libnvvm-windows-x86_64-13.3.33-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_nvtx-windows-x86_64-13.3.29-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cuda_profiler_api-windows-x86_64-13.3.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\visual_studio_integration-windows-x86_64-13.3.27-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\cccl-windows-x86_64-13.3.3.3.1-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" /E /I /H /Y
+          echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
+          echo "CUDA_PATH_V13_3=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.3" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -73,11 +73,10 @@ android:
    - changed-files:
        - any-glob-to-any-file:
            - examples/llama.android/**
-server/webui:
+server/ui:
    - changed-files:
        - any-glob-to-any-file:
-            - tools/server/webui/**
-            - tools/server/public/**
+            - tools/ui/**
 server:
    - changed-files:
        - any-glob-to-any-file:
--- a/.github/workflows/build-3rd-party.yml
+++ b/.github/workflows/build-3rd-party.yml
@@ -22,9 +22,9 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
  ubuntu-24-llguidance:
--- a/.github/workflows/build-and-test-snapdragon.yml
+++ b/.github/workflows/build-and-test-snapdragon.yml
@@ -31,7 +31,7 @@ jobs:
  android-ndk-snapdragon:
    runs-on: ubuntu-latest
    container:
-      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
+      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.7'
    defaults:
      run:
        shell: bash
@@ -58,14 +58,45 @@ jobs:
          name: llama-cpp-android-arm64-snapdragon
          path: pkg-snapdragon/llama.cpp

+  linux-iot-snapdragon:
+    runs-on: ubuntu-latest
+    container:
+      image: 'ghcr.io/snapdragon-toolchain/arm64-linux:v0.7'
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          lfs: false
+
+      - name: Build Llama.CPP for Snapdragon Linux IoT
+        id: build_llama_cpp_snapdragon_linux
+        run: |
+          cp docs/backend/snapdragon/CMakeUserPresets.json .
+          cmake --preset arm64-linux-snapdragon-release -B build-snapdragon -DGGML_OPENCL=ON
+          cmake --build build-snapdragon -j $(nproc)
+          cmake --install build-snapdragon --prefix pkg-snapdragon/llama.cpp
+
+      - name: Upload Llama.CPP Snapdragon Linux IoT Build Artifact
+        if: ${{ always() && steps.build_llama_cpp_snapdragon_linux.outcome == 'success' }}
+        uses: actions/upload-artifact@v6
+        with:
+          name: llama-cpp-linux-arm64-snapdragon
+          path: pkg-snapdragon/llama.cpp
+
  test-snapdragon-qdc:
-    name: Test on QDC Android Device (${{ matrix.device }})
-    needs: [android-ndk-snapdragon]
-    runs-on: ubuntu-slim
+    name: Test on QDC Device (${{ matrix.device }})
+    needs: [android-ndk-snapdragon, linux-iot-snapdragon]
+    runs-on: ubuntu-24.04-arm
+    timeout-minutes: 90
    strategy:
      fail-fast: false
      matrix:
-        device: [SM8750, SM8650, SM8850]
+        device: [SM8750, SM8850, QCS9075M]

    steps:
      - name: Checkout
@@ -74,11 +105,11 @@ jobs:
      - name: Download build artifact
        uses: actions/download-artifact@v7
        with:
-          name: llama-cpp-android-arm64-snapdragon
+          name: ${{ startsWith(matrix.device, 'QCS') && 'llama-cpp-linux-arm64-snapdragon' || 'llama-cpp-android-arm64-snapdragon' }}
          path: pkg-snapdragon/llama.cpp

      - name: Set up Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
        with:
          python-version: '3.x'
          cache: pip
@@ -107,7 +138,8 @@ jobs:
              --test       all \
              --pkg-dir    pkg-snapdragon/llama.cpp \
              --model-url  "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
-              --device     ${{ matrix.device }}
+              --device     ${{ matrix.device }} \
+              ${{ startsWith(matrix.device, 'QCS') && '--retries 2 --retry-delay 300' || '' }}
        env:
          QDC_API_KEY: ${{ secrets.QDC_API_KEY }}

--- a/.github/workflows/build-android.yml
+++ b/.github/workflows/build-android.yml
@@ -27,12 +27,12 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
-  android:
+  default:
    runs-on: ubuntu-latest

    steps:
@@ -58,7 +58,7 @@ jobs:
          cd examples/llama.android
          ./gradlew build --no-daemon

-  android-ndk:
+  ndk:
    runs-on: ubuntu-latest
    container:
      image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
@@ -73,6 +73,11 @@ jobs:
          fetch-depth: 0
          lfs: false

+      - name: Dependencies
+        run: |
+          apt-get update
+          apt-get install -y build-essential
+
      - name: Build
        id: ndk_build
        run: |
@@ -86,3 +91,59 @@ jobs:
        with:
          name: llama-cpp-android-arm64-cpu
          path: pkg-adb/llama.cpp
+
+  arm64:
+    runs-on: ubuntu-latest
+
+    env:
+      NDK_VERSION: "29.0.14206865"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      # note : disabled to spare some cache space (https://github.com/ggml-org/llama.cpp/pull/23789)
+      #        for some reason, the ccache does not improve the build time in this case
+      # example:
+      #   cache off: https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78160400831
+      #   cache on:  https://github.com/ggerganov/tmp2/actions/runs/26534713799/job/78224189394
+      #
+      #- name: ccache
+      #  uses: ggml-org/ccache-action@v1.2.21
+      #  with:
+      #    key: android-ubuntu-arm64
+      #    evict-old-files: 1d
+      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Set up JDK
+        uses: actions/setup-java@v5
+        with:
+          java-version: 17
+          distribution: temurin
+
+      - name: Setup Android SDK
+        uses: android-actions/setup-android@40fd30fb8d7440372e1316f5d1809ec01dcd3699 # v4.0.1
+        with:
+          log-accepted-android-sdk-licenses: false
+
+      - name: Install NDK
+        run: |
+          sdkmanager "ndk;${{ env.NDK_VERSION }}"
+          echo "ANDROID_NDK=${ANDROID_SDK_ROOT}/ndk/${{ env.NDK_VERSION }}" >> $GITHUB_ENV
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake \
+            -DANDROID_ABI=arm64-v8a \
+            -DANDROID_PLATFORM=android-28 \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_BACKEND_DL=ON \
+            -DGGML_NATIVE=OFF \
+            -DGGML_CPU_ALL_VARIANTS=ON \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build-apple.yml
+++ b/.github/workflows/build-apple.yml
@@ -32,12 +32,12 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
-  macOS-latest-ios:
+  macos-latest-arm64:
    runs-on: macos-latest

    steps:
@@ -48,7 +48,80 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: macOS-latest-ios
+          key: apple-arm64
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=OFF \
+            -DGGML_METAL_SHADER_DEBUG=ON \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+          leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main -E "test-llama-archs" --verbose --timeout 900
+
+  macos-latest-x64:
+    runs-on: macos-15-intel
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: apple-x64
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
+          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_BUILD_BORINGSSL=ON \
+            -DGGML_METAL=OFF \
+            -DGGML_RPC=ON \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
+          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+  macos-latest-ios:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      # TODO: this likely does not do anything - if yes, remove it
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: apple-ios
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

@@ -59,6 +132,7 @@ jobs:
          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_APP=OFF \
            -DLLAMA_BUILD_COMMON=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
@@ -89,6 +163,7 @@ jobs:
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_BUILD_APP=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -115,7 +190,7 @@ jobs:
          xcodebuild -downloadPlatform iOS
          xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build

-  macOS-latest-tvos:
+  macos-latest-tvos:
    runs-on: macos-latest

    steps:
@@ -123,10 +198,11 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

+      # TODO: this likely does not do anything - if yes, remove it
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: macOS-latest-tvos
+          key: apple-tvos
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

@@ -138,6 +214,7 @@ jobs:
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_COMMON=OFF \
+            -DLLAMA_BUILD_APP=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -147,7 +224,7 @@ jobs:
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

-  macOS-latest-visionos:
+  macos-latest-visionos:
    runs-on: macos-latest

    steps:
@@ -155,6 +232,14 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

+      # TODO: this likely does not do anything - if yes, remove it
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: apple-visionos
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
      - name: Build
        id: cmake_build
        run: |
@@ -163,6 +248,7 @@ jobs:
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_COMMON=OFF \
+            -DLLAMA_BUILD_APP=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
@@ -172,7 +258,7 @@ jobs:
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

-  macOS-latest-swift:
+  macos-latest-swift:
    runs-on: macos-latest
    needs: macos-latest-ios-xcode

@@ -185,10 +271,11 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

+      # TODO: this likely does not do anything - if yes, remove it
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: macOS-latest-swift
+          key: apple-swift
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

@@ -206,6 +293,7 @@ jobs:
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_BUILD_APP=OFF \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TOOLS=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
--- a/.github/workflows/build-cache.yml
+++ b/.github/workflows/build-cache.yml
@@ -28,7 +28,7 @@ jobs:
        id: cache-sdk
        with:
          path: ./vulkan_sdk
-          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
+          key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}

      - name: Setup Vulkan SDK
        if: steps.cache-sdk.outputs.cache-hit != 'true'
@@ -54,7 +54,7 @@ jobs:
  #      id: cache-toolchain
  #      with:
  #        path: ./spacemit_toolchain
-  #        key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
+  #        key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}

  #    - name: Setup SpacemiT Toolchain
  #      if: steps.cache-toolchain.outputs.cache-hit != 'true'
@@ -81,7 +81,7 @@ jobs:
        id: cache-openvino
        with:
          path: ./openvino_toolkit
-          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}

      - name: Setup OpenVINO Toolkit
        if: steps.cache-openvino.outputs.cache-hit != 'true'
@@ -108,7 +108,7 @@ jobs:
        id: cache-rocm
        with:
          path: C:\Program Files\AMD\ROCm
-          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+          key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}

      - name: Setup ROCm
        if: steps.cache-rocm.outputs.cache-hit != 'true'
--- a/.github/workflows/build-cann.yml
+++ b/.github/workflows/build-cann.yml
@@ -29,74 +29,76 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
-  openEuler-latest-cann:
-    defaults:
-      run:
-        shell: bash -el {0}
-    strategy:
-      matrix:
-        arch: [x86, aarch64]
-        chip_type: ['910b', '310p']
-        build: ['Release']
-        use_acl_graph: ['on', 'off']
-        exclude:
-          # 310P does not support USE_ACL_GRAPH=on
-          - chip_type: '310p'
-            use_acl_graph: 'on'
-    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v6
-        with:
-          fetch-depth: 0
-
-      - name: Free up disk space
-        uses: ggml-org/free-disk-space@v1.3.1
-        with:
-          tool-cache: true
-
-      - name: Set container image
-        id: cann-image
-        run: |
-          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
-          echo "image=${image}" >> "${GITHUB_OUTPUT}"
-
-      - name: Pull container image
-        run: docker pull "${{ steps.cann-image.outputs.image }}"
-
-      - name: Build
-        env:
-          BUILD_TYPE: ${{ matrix.build }}
-          SOC_TYPE: ascend${{ matrix.chip_type }}
-          USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
-        run: |
-          HOST_UID=$(id -u)
-          HOST_GID=$(id -g)
-
-          docker run --rm \
-            -v "${PWD}:/workspace" \
-            -w /workspace \
-            -e SOC_TYPE=${SOC_TYPE} \
-            -e BUILD_TYPE=${BUILD_TYPE} \
-            -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
-            "${{ steps.cann-image.outputs.image }}" \
-            bash -lc '
-              set -e
-              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
-              yum clean all && rm -rf /var/cache/yum
-              git config --global --add safe.directory "/workspace"
-              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
-              cmake -S . -B build \
-                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
-                  -DGGML_CANN=on \
-                  -DSOC_TYPE=${SOC_TYPE} \
-                  -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
-              cmake --build build -j $(nproc)
-
-              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
-            '
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  openEuler-latest-cann:
+#    defaults:
+#      run:
+#        shell: bash -el {0}
+#    strategy:
+#      matrix:
+#        arch: [x86, aarch64]
+#        chip_type: ['910b', '310p']
+#        build: ['Release']
+#        use_acl_graph: ['on', 'off']
+#        exclude:
+#          # 310P does not support USE_ACL_GRAPH=on
+#          - chip_type: '310p'
+#            use_acl_graph: 'on'
+#    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+#    steps:
+#      - name: Checkout
+#        uses: actions/checkout@v6
+#        with:
+#          fetch-depth: 0
+#
+#      - name: Free up disk space
+#        uses: ggml-org/free-disk-space@v1.3.1
+#        with:
+#          tool-cache: true
+#
+#      - name: Set container image
+#        id: cann-image
+#        run: |
+#          image="ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.5.0-910b-openeuler24.03-py3.11' || '8.5.0-310p-openeuler24.03-py3.11' }}"
+#          echo "image=${image}" >> "${GITHUB_OUTPUT}"
+#
+#      - name: Pull container image
+#        run: docker pull "${{ steps.cann-image.outputs.image }}"
+#
+#      - name: Build
+#        env:
+#          BUILD_TYPE: ${{ matrix.build }}
+#          SOC_TYPE: ascend${{ matrix.chip_type }}
+#          USE_ACL_GRAPH: ${{ matrix.use_acl_graph }}
+#        run: |
+#          HOST_UID=$(id -u)
+#          HOST_GID=$(id -g)
+#
+#          docker run --rm \
+#            -v "${PWD}:/workspace" \
+#            -w /workspace \
+#            -e SOC_TYPE=${SOC_TYPE} \
+#            -e BUILD_TYPE=${BUILD_TYPE} \
+#            -e USE_ACL_GRAPH=${USE_ACL_GRAPH} \
+#            "${{ steps.cann-image.outputs.image }}" \
+#            bash -lc '
+#              set -e
+#              yum install -y --setopt=install_weak_deps=False --setopt=tsflags=nodocs git gcc gcc-c++ make cmake openssl-devel
+#              yum clean all && rm -rf /var/cache/yum
+#              git config --global --add safe.directory "/workspace"
+#              export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+#              cmake -S . -B build \
+#                  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+#                  -DGGML_CANN=on \
+#                  -DSOC_TYPE=${SOC_TYPE} \
+#                  -DUSE_ACL_GRAPH=${USE_ACL_GRAPH}
+#              cmake --build build -j $(nproc)
+#
+#              chown -R '"${HOST_UID}"':'"${HOST_GID}"' /workspace/build
+#            '
--- a/.github/workflows/build-cmake-pkg.yml
+++ b/.github/workflows/build-cmake-pkg.yml
@@ -5,23 +5,23 @@ on:

 jobs:
  linux:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, Linux, CPU]
    steps:
      - uses: actions/checkout@v6
        with:
          fetch-depth: 0

-      - name: Install dependencies
-        run: |
-          sudo apt update
-          sudo apt install -y build-essential tcl cmake
-
      - name: Build
        run: |
          PREFIX="$(pwd)"/inst
-          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
-                -DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-                -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
+          cmake -S . -B build \
+                -DCMAKE_PREFIX_PATH="$PREFIX" \
+                -DLLAMA_OPENSSL=OFF \
+                -DLLAMA_BUILD_TESTS=OFF \
+                -DLLAMA_BUILD_TOOLS=OFF \
+                -DLLAMA_BUILD_EXAMPLES=OFF \
+                -DLLAMA_BUILD_APP=OFF \
+                -DCMAKE_BUILD_TYPE=Release
          cmake --build build --config Release
          cmake --install build --prefix "$PREFIX" --config Release

--- a/.github/workflows/build-cpu.yml
+++ b/.github/workflows/build-cpu.yml
@@ -0,0 +1,231 @@
+name: CI (cpu)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-cpu.yml',
+      '.github/workflows/build-cmake-pkg.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh',
+      '**/*.swift',
+      '**/*.m',
+      '**/*.metal',
+      '**/*.comp',
+      '**/*.glsl',
+      '**/*.wgsl'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-cpu.yml',
+      '.github/workflows/build-cmake-pkg.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh',
+      '**/*.swift',
+      '**/*.m',
+      '**/*.metal',
+      '**/*.comp',
+      '**/*.glsl',
+      '**/*.wgsl'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+  build-cmake-pkg:
+    uses: ./.github/workflows/build-cmake-pkg.yml
+
+  ubuntu:
+    strategy:
+      matrix:
+        include:
+          - build: 'x64'
+            os: ubuntu-22.04
+          - build: 'arm64'
+            os: ubuntu-24.04-arm
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: cpu-${{ matrix.os }}
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build Dependencies
+        id: build_depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends \
+            python3 python3-pip python3-dev python3-wheel \
+            libjpeg-dev build-essential libssl-dev \
+            git-lfs
+
+      - name: Toolchain workaround (GCC 14)
+        if: ${{ contains(matrix.os, 'ubuntu-24.04') }}
+        run: |
+          sudo apt-get install -y gcc-14 g++-14
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"
+
+      - name: Python Dependencies
+        id: python_depends
+        run: |
+          export PIP_BREAK_SYSTEM_PACKAGES="1"
+          python3 -m pip install --upgrade pip setuptools
+          pip3 install ./gguf-py
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Test llama2c conversion
+        id: llama2c_test
+        run: |
+          cd build
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
+  windows:
+    runs-on: windows-2025
+
+    env:
+      OPENBLAS_VERSION: 0.3.23
+      SDE_VERSION: 9.33.0-2024-01-07
+      VULKAN_VERSION: 1.4.313.2
+
+    strategy:
+      matrix:
+        include:
+          - build: 'x64-cpu-static'
+            arch: 'x64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF'
+          - build: 'x64-openblas'
+            arch: 'x64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+          - build: 'x64-vulkan'
+            arch: 'x64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
+          - build: 'arm64'
+            arch: 'arm64'
+            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: cpu-windows-2025-${{ matrix.build }}
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Download OpenBLAS
+        id: get_openblas
+        if: ${{ matrix.build == 'x64-openblas' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
+          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
+          mkdir $env:RUNNER_TEMP/openblas
+          tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
+          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
+          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
+          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
+          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
+
+      - name: Install Vulkan SDK
+        id: get_vulkan
+        if: ${{ matrix.build == 'x64-vulkan' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe"
+          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
+          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
+          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -S . -B build ${{ matrix.defines }} `
+            -DLLAMA_BUILD_BORINGSSL=ON
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+
+      - name: Add libopenblas.dll
+        id: add_libopenblas_dll
+        if: ${{ matrix.build == 'x64-openblas' }}
+        run: |
+          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
+          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
+
+      - name: Test
+        id: cmake_test
+        if: ${{ matrix.arch == 'x64' }}
+        run: |
+          cd build
+          ctest -L main -C Release --verbose --timeout 900
+
+      # TODO: disabled for now, consider adding tests for all CPU variants instead
+      # - name: Test (Intel SDE)
+      #   id: cmake_test_sde
+      #   if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
+      #   run: |
+      #     curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
+      #     # for some weird reason windows tar doesn't like sde tar.xz
+      #     7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
+      #     7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
+      #     $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
+      #     cd build
+      #     $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
+      #     & $sde -future -- ctest -L main -C Release --verbose --timeout 900
--- a/.github/workflows/build-cross.yml
+++ b/.github/workflows/build-cross.yml
@@ -277,7 +277,7 @@ jobs:

    env:
      # Make sure this is in sync with build-cache.yml
-      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
+      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.2.4"

    steps:
      - uses: actions/checkout@v6
@@ -287,7 +287,7 @@ jobs:
      #  id: cache-toolchain
      #  with:
      #    path: ./spacemit_toolchain
-      #    key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
+      #    key: cache-gha-spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}

      - name: Setup SpacemiT Toolchain
        #if: steps.cache-toolchain.outputs.cache-hit != 'true'
--- a/.github/workflows/build-cuda-ubuntu.yml
+++ b/.github/workflows/build-cuda-ubuntu.yml
@@ -0,0 +1,134 @@
+name: CI (CUDA, ubuntu)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-cuda-ubuntu.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cu',
+      '**/*.cuh'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-cuda-ubuntu.yml',
+      'ggml/src/ggml-cuda/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+  cuda:
+    runs-on: ubuntu-24.04
+    container: nvidia/cuda:12.6.2-devel-ubuntu24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Install dependencies
+        env:
+          DEBIAN_FRONTEND: noninteractive
+        run: |
+          apt update
+          apt install -y cmake build-essential ninja-build libgomp1 git libssl-dev
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: cuda-ubuntu-24.04-cuda
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build with CMake
+        # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
+        run: |
+          cmake -S . -B build -G Ninja \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DCMAKE_CUDA_ARCHITECTURES=89-real \
+            -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
+            -DGGML_NATIVE=OFF \
+            -DGGML_CUDA=ON \
+            -DGGML_CUDA_CUB_3DOT2=ON
+          cmake --build build
+
+  hip:
+    runs-on: ubuntu-22.04
+    container: rocm/dev-ubuntu-22.04:6.1.2
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libssl-dev rocwmma-dev
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: cuda-ubuntu-22.04-hip
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build with native CMake HIP support
+        id: cmake_build
+        run: |
+          cmake -B build -S . \
+            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
+            -DGGML_HIP_ROCWMMA_FATTN=ON \
+            -DGPU_TARGETS="gfx1030" \
+            -DGGML_HIP=ON
+          cmake --build build --config Release -j $(nproc)
+
+  musa:
+    runs-on: ubuntu-22.04
+    container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          apt-get update
+          apt-get install -y build-essential git cmake libssl-dev
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: cuda-ubuntu-22.04-musa
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build with native CMake MUSA support
+        id: cmake_build
+        run: |
+          cmake -B build -S . \
+            -DGGML_MUSA=ON
+          time cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build-cuda-windows.yml
+++ b/.github/workflows/build-cuda-windows.yml
@@ -0,0 +1,146 @@
+name: CI (CUDA, windows)
+
+# TODO: this workflow is only triggered manually because it is very heavy on the CI
+#       when we provision dedicated windows runners, we can enable it for pushes too
+# note: running this workflow manually will populate the ccache for the release builds
+#       this can be used before merging a PR to speed up the release workflow
+on:
+  workflow_dispatch: # allows manual triggering
+
+# note: this will run in queue with the release workflow
+concurrency:
+  group: release
+  queue: max
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+  cuda:
+    runs-on: windows-2022
+
+    strategy:
+      matrix:
+        cuda: ['12.4', '13.3']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: release-windows-2022-x64-cuda-${{ matrix.cuda }}
+          append-timestamp: false # note: use this only with non-concurrent jobs!
+
+      - name: Install Cuda Toolkit
+        uses: ./.github/actions/windows-setup-cuda
+        with:
+          cuda_version: ${{ matrix.cuda }}
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Build
+        id: cmake_build
+        shell: cmd
+        # TODO: Remove GGML_CUDA_CUB_3DOT2 flag once CCCL 3.2 is bundled within CTK and that CTK version is used in this project
+        run: |
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
+          cmake -S . -B build -G "Ninja Multi-Config" ^
+            -DLLAMA_BUILD_SERVER=ON ^
+            -DLLAMA_BUILD_BORINGSSL=ON ^
+            -DGGML_NATIVE=OFF ^
+            -DGGML_BACKEND_DL=ON ^
+            -DGGML_CPU_ALL_VARIANTS=ON ^
+            -DGGML_CUDA=ON ^
+            -DGGML_RPC=ON ^
+            -DGGML_CUDA_CUB_3DOT2=ON
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
+          cmake --build build --config Release
+
+  hip:
+    runs-on: windows-2022
+
+    env:
+      # Make sure this is in sync with build-cache.yml
+      HIPSDK_INSTALLER_VERSION: "26.Q1"
+
+    strategy:
+      matrix:
+        include:
+          # sync with release.yml
+          - name: "radeon"
+            gpu_targets: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Grab rocWMMA package
+        id: grab_rocwmma
+        run: |
+          curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70201-81~24.04_amd64.deb"
+          7z x rocwmma.deb
+          7z x data.tar
+
+      - name: Use ROCm Installation Cache
+        uses: actions/cache@v5
+        id: cache-rocm
+        with:
+          path: C:\Program Files\AMD\ROCm
+          key: cache-gha-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+
+      - name: Setup ROCm
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
+        uses: ./.github/actions/windows-setup-rocm
+        with:
+          version: ${{ env.HIPSDK_INSTALLER_VERSION }}
+
+      - name: Verify ROCm
+        id: verify
+        run: |
+          # Find and test ROCm installation
+          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
+          if (-not $clangPath) {
+            Write-Error "ROCm installation not found"
+            exit 1
+          }
+          & $clangPath.FullName --version
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          # TODO: this build does not match the build in release.yml, so we use a different cache key
+          #       ideally, the builds should match, similar to the CUDA build above so that we would be able
+          #       to populate the ccache for the release with manual runs of this workflow
+          #key: release-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
+          key: cuda-windows-2022-x64-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}
+          append-timestamp: false # note: use this only with non-concurrent jobs!
+
+      - name: Build
+        id: cmake_build
+        run: |
+          $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
+          $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+          cmake -G "Unix Makefiles" -B build -S . `
+            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
+            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.1/include/" `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DLLAMA_BUILD_BORINGSSL=ON `
+            -DROCM_DIR="${env:HIP_PATH}" `
+            -DGGML_HIP=ON `
+            -DGGML_HIP_ROCWMMA_FATTN=ON `
+            -DGPU_TARGETS="gfx1100"  `
+            -DGGML_RPC=ON
+          cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
--- a/.github/workflows/build-ibm.yml
+++ b/.github/workflows/build-ibm.yml
@@ -0,0 +1,150 @@
+name: CI (ibm)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-ibm.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-ibm.yml',
+      'ggml/src/ggml-cpu/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+
+  ubuntu-24-s390x:
+    runs-on: ubuntu-24.04-s390x
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Build Dependencies
+        id: build_depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends \
+            python3 python3-pip python3-dev python3-wheel \
+            libjpeg-dev build-essential libssl-dev \
+            git-lfs
+
+      - name: Toolchain workaround (GCC 14)
+        run: |
+          sudo apt-get install -y gcc-14 g++-14
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"
+
+      - name: Python Dependencies
+        id: python_depends
+        run: |
+          export PIP_BREAK_SYSTEM_PACKAGES="1"
+          python3 -m pip install --upgrade pip setuptools
+          pip3 install ./gguf-py
+
+      - name: Swap Endianness
+        id: endianness
+        run: |
+          for f in models/*.gguf; do
+            echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big
+          done
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Test llama2c (s390x)
+        id: llama2c_test_s390x
+        run: |
+          cd build
+          echo "Fetch llama2c big-endian model"
+          wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
+          ./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
+  ubuntu-24-ppc64le:
+    runs-on: ubuntu-24.04-ppc64le
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Build Dependencies
+        id: build_depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends \
+            python3 python3-pip python3-dev python3-wheel \
+            libjpeg-dev build-essential libssl-dev \
+            git-lfs
+
+      - name: Toolchain workaround (GCC 14)
+        run: |
+          sudo apt-get install -y gcc-14 g++-14
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"
+
+      - name: Python Dependencies
+        id: python_depends
+        run: |
+          export PIP_BREAK_SYSTEM_PACKAGES="1"
+          python3 -m pip install --upgrade pip setuptools
+          pip3 install ./gguf-py
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Test llama2c conversion
+        id: llama2c_test
+        run: |
+          cd build
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
--- a/.github/workflows/build-msys.yml
+++ b/.github/workflows/build-msys.yml
@@ -15,9 +15,9 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
  windows-msys2:
@@ -37,7 +37,7 @@ jobs:
      #- name: ccache
      #  uses: ggml-org/ccache-action@v1.2.16
      #  with:
-      #    key: windows-msys2
+      #    key: msys-windows-2025-x64
      #    variant: ccache
      #    evict-old-files: 1d
      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
--- a/.github/workflows/build-opencl.yml
+++ b/.github/workflows/build-opencl.yml
@@ -0,0 +1,82 @@
+name: CI (opencl)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-opencl.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.cl'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-opencl.yml',
+      'ggml/src/ggml-opencl/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+  windows-2025-opencl-adreno:
+    runs-on: windows-2025
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: opencl-windows-2025-x64
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Install Ninja
+        id: install_ninja
+        run: |
+          choco install ninja
+
+      - name: Install OpenCL Headers and Libs
+        id: install_opencl
+        run: |
+          git clone https://github.com/KhronosGroup/OpenCL-Headers
+          cd OpenCL-Headers
+          cmake -B build `
+            -DBUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_TESTING=OFF `
+            -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build --target install
+          git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+          cd OpenCL-ICD-Loader
+          cmake -B build-arm64-release `
+            -A arm64 `
+            -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
+            -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
+          cmake --build build-arm64-release --target install --config release
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -S . -B build -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON -DLLAMA_BUILD_BORINGSSL=ON
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
--- a/.github/workflows/build-openvino.yml
+++ b/.github/workflows/build-openvino.yml
@@ -29,9 +29,9 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
  ubuntu-24-openvino:
@@ -67,7 +67,7 @@ jobs:
        if: runner.environment == 'github-hosted'
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: ubuntu-24-openvino-${{ matrix.variant }}-no-preset-v1
+          key: openvino-ubuntu-24.04-${{ matrix.variant }}-no-preset-v1
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

@@ -84,7 +84,7 @@ jobs:
        id: cache-openvino
        with:
          path: ./openvino_toolkit
-          key: openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}
+          key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }}

      - name: Setup OpenVINO Toolkit
        if: steps.cache-openvino.outputs.cache-hit != 'true'
--- a/.github/workflows/build-riscv.yml
+++ b/.github/workflows/build-riscv.yml
@@ -29,11 +29,84 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
+  ubuntu-cpu-riscv64-native:
+    runs-on: ubuntu-24.04-riscv
+
+    steps:
+      - name: Install dependencies
+        run: |
+          # Install necessary packages
+          sudo apt-get update
+          sudo apt-get install -y libssl-dev
+
+          # Set gcc-14 and g++-14 as the default compilers
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-14 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-14 100
+
+          git lfs install
+
+      - name: Check environment
+        run: |
+          uname -a
+          gcc --version
+          g++ --version
+          ldd --version
+          cmake --version
+          rustc --version
+          env
+          echo "nproc=$(nproc)"
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      # note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation
+      #- name: ccache
+      #  uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
+      #  with:
+      #    key: riscv-ubuntu-native
+      #    evict-old-files: 1d
+      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_OPENMP=OFF \
+            -DLLAMA_BUILD_EXAMPLES=ON \
+            -DLLAMA_BUILD_TOOLS=ON \
+            -DLLAMA_BUILD_TESTS=ON \
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DGGML_RPC=ON \
+            -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+            -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14
+
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Test llama2c conversion
+        id: llama2c_test
+        run: |
+          cd build
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
  ubuntu-riscv64-native-sanitizer:
    runs-on: ubuntu-24.04-riscv

@@ -62,12 +135,13 @@ jobs:
        id: checkout
        uses: actions/checkout@v6

-      - name: ccache
-        uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
-        with:
-          key: ubuntu-riscv64-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+      # note: sparing some ccache since these jobs run on dedicated runners that are not part of the organitzation
+      #- name: ccache
+      #  uses: ggml-org/ccache-action@afde29e5b5422e5da23cb1f639e8baecadeadfc3 # https://github.com/ggml-org/ccache-action/pull/1
+      #  with:
+      #    key: riscv-ubuntu-native-sanitizer-${{ matrix.sanitizer }}-${{ matrix.build_type }}
+      #    evict-old-files: 1d
+      #    save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

      - name: Build
        id: cmake_build
--- a/.github/workflows/build-rpc.yml
+++ b/.github/workflows/build-rpc.yml
@@ -0,0 +1,66 @@
+name: CI (rpc)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-rpc.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-rpc.yml',
+      'ggml/src/ggml-rpc/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+  ubuntu-latest-rpc:
+    runs-on: ubuntu-latest
+
+    continue-on-error: true
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libssl-dev ninja-build
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_RPC=ON
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose
--- a/.github/workflows/build-sanitize.yml
+++ b/.github/workflows/build-sanitize.yml
@@ -22,66 +22,65 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
-  ubuntu-latest-sanitizer:
-    runs-on: ubuntu-latest
+  ctest:
+    runs-on: [self-hosted, X64, CPU, Linux]

    continue-on-error: true

    strategy:
      matrix:
        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-latest-sanitizer-${{ matrix.sanitizer }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Dependencies
-        id: depends
+      # with UNDEFINED sanitizer, we have to build in Debug to avoid GCC 13 false-positive warnings
+      - name: Build (undefined)
+        id: cmake_build_undefined
+        if: ${{ matrix.sanitizer == 'UNDEFINED' }}
        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libssl-dev
+          cmake -B build \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON
+
+          cmake --build build --config Debug -j $(nproc)

      - name: Build
        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
+        if: ${{ matrix.sanitizer == 'ADDRESS' }}
        run: |
          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
+            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON

-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+          cmake --build build --config RelWithDebInfo -j $(nproc)

      - name: Build (no OpenMP)
        id: cmake_build_no_openmp
        if: ${{ matrix.sanitizer == 'THREAD' }}
        run: |
          cmake -B build \
-            -DLLAMA_FATAL_WARNINGS=ON \
+            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
            -DGGML_SANITIZE_${{ matrix.sanitizer }}=ON \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DGGML_OPENMP=OFF

-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc)
+          cmake --build build --config RelWithDebInfo -j $(nproc)

      - name: Test
        id: cmake_test
+        # skip run in Debug - very slow
+        if: ${{ matrix.sanitizer != 'UNDEFINED' }}
        run: |
          cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L main -E tokenizer --verbose --timeout 900
--- a/.github/workflows/build-self-hosted.yml
+++ b/.github/workflows/build-self-hosted.yml
@@ -50,12 +50,12 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
-  ggml-ci-nvidia-cuda:
+  gpu-cuda:
    runs-on: [self-hosted, Linux, NVIDIA]

    steps:
@@ -67,9 +67,9 @@ jobs:
        id: ggml-ci
        run: |
          nvidia-smi
-          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-nvidia-vulkan-cm:
+  gpu-vulkan-nvidia-cm:
    runs-on: [self-hosted, Linux, NVIDIA]

    steps:
@@ -81,9 +81,9 @@ jobs:
        id: ggml-ci
        run: |
          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-nvidia-vulkan-cm2:
+  gpu-vulkan-nvidia-cm2:
    runs-on: [self-hosted, Linux, NVIDIA, COOPMAT2]

    steps:
@@ -95,40 +95,39 @@ jobs:
        id: ggml-ci
        run: |
          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  # TODO: investigate slight precision issues in some operations for test-backend-ops on the WebGPU backend.
-  #ggml-ci-nvidia-webgpu:
-  #  runs-on: [self-hosted, Linux, NVIDIA]
+  gpu-webgpu-nvidia:
+    runs-on: [self-hosted, Linux, NVIDIA, X64]

-  #  steps:
-  #    - name: Clone
-  #      id: checkout
-  #      uses: actions/checkout@v6
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6

-  #    - name: Dawn Dependency
-  #      id: dawn-depends
-  #      run: |
-  #        DAWN_VERSION="v20260317.182325"
-  #        DAWN_OWNER="google"
-  #        DAWN_REPO="dawn"
-  #        DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
-  #        echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-  #        curl -L -o artifact.tar.gz \
-  #          "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
-  #        mkdir dawn
-  #        tar -xvf artifact.tar.gz -C dawn --strip-components=1
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          DAWN_VERSION="v20260317.182325"
+          DAWN_OWNER="google"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
+          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          curl -L -o artifact.tar.gz \
+            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          mkdir dawn
+          tar -xvf artifact.tar.gz -C dawn --strip-components=1

-  #    - name: Test
-  #      id: ggml-ci
-  #      run: |
-  #        GG_BUILD_WEBGPU=1 \
-  #        GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
-  #        GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
-  #          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+      - name: Test
+        id: ggml-ci
+        run: |
+          GG_BUILD_WEBGPU=1 \
+          GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
+          GG_BUILD_WEBGPU_DAWN_DIR="$GITHUB_WORKSPACE/dawn/lib64/cmake/Dawn" \
+            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  # TODO: provision AMX-compatible machine
-  #ggml-ci-cpu-amx:
+  #cpu-amx:
  #  runs-on: [self-hosted, Linux, CPU, AMX]

  #  steps:
@@ -139,10 +138,10 @@ jobs:
  #    - name: Test
  #      id: ggml-ci
  #      run: |
-  #        bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #        bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  # TODO: provision AMD GPU machine
-  # ggml-ci-amd-vulkan:
+  # amd-vulkan:
  #   runs-on: [self-hosted, Linux, AMD]

  #   steps:
@@ -154,10 +153,10 @@ jobs:
  #       id: ggml-ci
  #       run: |
  #         vulkaninfo --summary
-  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #         GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

  # TODO: provision AMD GPU machine
-  # ggml-ci-amd-rocm:
+  # amd-rocm:
  #   runs-on: [self-hosted, Linux, AMD]

  #   steps:
@@ -169,9 +168,9 @@ jobs:
  #       id: ggml-ci
  #       run: |
  #         amd-smi static
-  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+  #         GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-mac-metal:
+  gpu-metal:
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -184,7 +183,7 @@ jobs:
        run: |
          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-mac-webgpu:
+  gpu-webgpu-apple:
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -211,7 +210,7 @@ jobs:
          GG_BUILD_WEBGPU=1 GG_BUILD_WEBGPU_DAWN_PREFIX="$GITHUB_WORKSPACE/dawn" \
            bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-mac-vulkan:
+  gpu-vulkan:
    runs-on: [self-hosted, macOS, ARM64]

    steps:
@@ -225,7 +224,7 @@ jobs:
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-linux-intel-vulkan:
+  gpu-vulkan-intel-linux:
    runs-on: [self-hosted, Linux, Intel]

    steps:
@@ -241,7 +240,7 @@ jobs:
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

-  ggml-ci-win-intel-vulkan:
+  gpu-vulkan-intel-windows:
    runs-on: [self-hosted, Windows, X64, Intel]

    steps:
@@ -262,7 +261,7 @@ jobs:
          # a valid python environment for testing
          LLAMA_FATAL_WARNINGS=OFF GG_BUILD_NINJA=1 GG_BUILD_VULKAN=1 GG_BUILD_LOW_PERF=1 ./ci/run.sh ./results/llama.cpp ./mnt/llama.cpp

-  ggml-ci-intel-openvino-gpu-low-perf:
+  cpu-openvino-low-perf:
    runs-on: [self-hosted, Linux, Intel, OpenVINO]

    concurrency:
@@ -296,4 +295,110 @@ jobs:
        id: ggml-ci
        run: |
          source ./openvino_toolkit/setupvars.sh
-          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+          GG_BUILD_OPENVINO=1 GGML_OPENVINO_DEVICE=GPU GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  cpu-any-low-perf:
+    runs-on: [self-hosted, CPU]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  cpu-any-high-perf:
+    runs-on: [self-hosted, CPU]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_HIGH_PERF=1 GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  cpu-arm64-graviton4:
+    runs-on: ah-ubuntu_22_04-c8g_8x
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          set -euxo pipefail
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
+          apt-get install -y \
+          build-essential \
+          python3-venv \
+          gpg \
+          wget \
+          time \
+          git-lfs
+
+          git lfs install
+
+          # install the latest cmake
+          sudo install -d /usr/share/keyrings
+          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
+            | gpg --dearmor \
+            | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
+          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
+            | sudo tee /etc/apt/sources.list.d/kitware.list
+          sudo apt-get update
+          sudo apt-get install -y cmake
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
+
+  cpu-arm64-graviton4-kleidiai:
+    runs-on: ah-ubuntu_22_04-c8g_8x
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: Dependencies
+        id: depends
+        run: |
+          set -euxo pipefail
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
+          apt-get install -y \
+          build-essential \
+          python3-venv \
+          gpg \
+          wget \
+          time \
+          git-lfs
+
+          git lfs install
+
+          # install the latest cmake
+          sudo install -d /usr/share/keyrings
+          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
+            | gpg --dearmor \
+            | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
+          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
+            | sudo tee /etc/apt/sources.list.d/kitware.list
+          sudo apt-get update
+          sudo apt-get install -y cmake
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          GG_BUILD_KLEIDIAI=1 \
+          GG_BUILD_EXTRA_TESTS_0=1 \
+          bash ./ci/run.sh ./tmp/results ./tmp/mnt
--- a/.github/workflows/build-sycl.yml
+++ b/.github/workflows/build-sycl.yml
@@ -29,132 +29,134 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:

-  ubuntu-24-sycl:
-    strategy:
-      matrix:
-        build: [fp32, fp16]
-        include:
-          - build: fp32
-            fp16: OFF
-          - build: fp16
-            fp16: ON
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  ubuntu-24-sycl:
+#    strategy:
+#      matrix:
+#        build: [fp32]
+#        include:
+#          - build: fp32
+#            fp16: OFF
+#
+#    runs-on: ubuntu-24.04
+#
+#    env:
+#      ONEAPI_ROOT: /opt/intel/oneapi/
+#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+#      LEVEL_ZERO_VERSION: "1.28.2"
+#      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
+#
+#    continue-on-error: true
+#
+#    steps:
+#      - uses: actions/checkout@v6
+#
+#      - name: Use oneAPI Installation Cache
+#        uses: actions/cache@v5
+#        id: cache-sycl
+#        with:
+#          path: ${{ env.ONEAPI_ROOT }}
+#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+#
+#      - name: Download & Install oneAPI
+#        shell: bash
+#        if: steps.cache-sycl.outputs.cache-hit != 'true'
+#        run: |
+#          cd /tmp
+#          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
+#          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
+#
+#      - name: Install Level Zero SDK
+#        shell: bash
+#        run: |
+#          cd /tmp
+#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
+#          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
+#          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
+#
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: sycl-ubuntu-24-${{ matrix.build }}
+#          evict-old-files: 1d
+#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+#
+#      - name: Build
+#        id: cmake_build
+#        run: |
+#          source /opt/intel/oneapi/setvars.sh
+#          cmake -B build \
+#            -G "Ninja" \
+#            -DCMAKE_BUILD_TYPE=Release \
+#            -DGGML_SYCL=ON \
+#            -DCMAKE_C_COMPILER=icx \
+#            -DCMAKE_CXX_COMPILER=icpx \
+#            -DLLAMA_OPENSSL=OFF \
+#            -DGGML_NATIVE=OFF \
+#            -DGGML_SYCL_F16=${{ matrix.fp16 }}
+#          time cmake --build build --config Release -j $(nproc)

-    runs-on: ubuntu-24.04
-
-    env:
-      ONEAPI_ROOT: /opt/intel/oneapi/
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-      LEVEL_ZERO_VERSION: "1.28.2"
-      LEVEL_ZERO_UBUNTU_VERSION: "u24.04"
-
-    continue-on-error: true
-
-    steps:
-      - uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          cd /tmp
-          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh
-          sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept
-
-      - name: Install Level Zero SDK
-        shell: bash
-        run: |
-          cd /tmp
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb
-          wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb
-          sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb
-
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: ubuntu-24-sycl-${{ matrix.build }}
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          source /opt/intel/oneapi/setvars.sh
-          cmake -B build \
-            -G "Ninja" \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DGGML_SYCL=ON \
-            -DCMAKE_C_COMPILER=icx \
-            -DCMAKE_CXX_COMPILER=icpx \
-            -DLLAMA_OPENSSL=OFF \
-            -DGGML_NATIVE=OFF \
-            -DGGML_SYCL_F16=${{ matrix.fp16 }}
-          time cmake --build build --config Release -j $(nproc)
-
-  windows-latest-sycl:
-    runs-on: windows-2022
-
-    defaults:
-      run:
-        shell: bash
-
-    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
-      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
-      ONEAPI_INSTALLER_VERSION: "2025.3.3"
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v6
-
-      - name: Use oneAPI Installation Cache
-        uses: actions/cache@v5
-        id: cache-sycl
-        with:
-          path: ${{ env.ONEAPI_ROOT }}
-          key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Download & Install oneAPI
-        shell: bash
-        if: steps.cache-sycl.outputs.cache-hit != 'true'
-        run: |
-          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
-
-      - name: Install Level Zero SDK
-        shell: pwsh
-        run: |
-          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
-          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
-          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.21
-        with:
-          key: windows-latest-sycl
-          variant: ccache
-          evict-old-files: 1d
-          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
-
-      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
-
-      - name: Build
-        id: cmake_build
-        run:  examples/sycl/win-build-sycl.bat
+# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705)
+#       in order to enable it again, we have to provision dedicated runners  to run it
+#  windows-latest-sycl:
+#    runs-on: windows-2022
+#
+#    defaults:
+#      run:
+#        shell: bash
+#
+#    env:
+#      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe
+#      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
+#      LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip
+#      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+#      ONEAPI_INSTALLER_VERSION: "2025.3.3"
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v6
+#
+#      - name: Use oneAPI Installation Cache
+#        uses: actions/cache@v5
+#        id: cache-sycl
+#        with:
+#          path: ${{ env.ONEAPI_ROOT }}
+#          key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }}
+#
+#      - name: Download & Install oneAPI
+#        shell: bash
+#        if: steps.cache-sycl.outputs.cache-hit != 'true'
+#        run: |
+#          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+#
+#      - name: Install Level Zero SDK
+#        shell: pwsh
+#        run: |
+#          Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip"
+#          Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force
+#          "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append
+#
+#      - name: ccache
+#        uses: ggml-org/ccache-action@v1.2.21
+#        with:
+#          key: sycl-windows-latest
+#          variant: ccache
+#          evict-old-files: 1d
+#          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+#
+#      # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args
+#
+#      - name: Build
+#        id: cmake_build
+#        run:  examples/sycl/win-build-sycl.bat
--- a/.github/workflows/build-vulkan.yml
+++ b/.github/workflows/build-vulkan.yml
@@ -31,12 +31,59 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
-  ubuntu-24-vulkan-llvmpipe:
+  ubuntu:
+    strategy:
+      matrix:
+        include:
+          - build: 'x64'
+            os: ubuntu-24.04
+          - build: 'arm64'
+            os: ubuntu-24.04-arm
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: vulkan-${{ matrix.os }}
+          variant: ccache
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y gcc-14 g++-14 build-essential glslc libvulkan-dev spirv-headers libssl-dev ninja-build
+          echo "CC=gcc-14" >> "$GITHUB_ENV"
+          echo "CXX=g++-14" >> "$GITHUB_ENV"
+
+      - name: Configure
+        id: cmake_configure
+        run: |
+          cmake -B build \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+            -DGGML_BACKEND_DL=ON \
+            -DGGML_CPU_ALL_VARIANTS=ON \
+            -DGGML_VULKAN=ON
+
+      - name: Build
+        id: cmake_build
+        run: |
+          time cmake --build build -j $(nproc)
+
+  ubuntu-llvmpipe:
    runs-on: ubuntu-24.04

    steps:
@@ -47,7 +94,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: ubuntu-24-vulkan-llvmpipe
+          key: vulkan-ubuntu-24.04-llvmpipe
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

@@ -68,7 +115,7 @@ jobs:
        id: cache-sdk
        with:
          path: ./vulkan_sdk
-          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
+          key: cache-gha-vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}

      - name: Setup Vulkan SDK
        if: steps.cache-sdk.outputs.cache-hit != 'true'
--- a/.github/workflows/build-webgpu.yml
+++ b/.github/workflows/build-webgpu.yml
@@ -0,0 +1,181 @@
+name: CI (webgpu)
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/build-webgpu.yml',
+      '**/CMakeLists.txt',
+      '**/.cmake',
+      '**/*.h',
+      '**/*.hpp',
+      '**/*.c',
+      '**/*.cpp',
+      '**/*.wgsl'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/build-webgpu.yml',
+      'ggml/src/ggml-webgpu/**'
+    ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  GGML_NLOOP: 3
+  GGML_N_THREADS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+
+jobs:
+  macos:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: webgpu-macos-latest
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          DAWN_VERSION="v20260317.182325"
+          DAWN_OWNER="google"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
+          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          curl -L -o artifact.tar.gz \
+            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          mkdir dawn
+          tar -xvf artifact.tar.gz -C dawn --strip-components=1
+
+      - name: Build
+        id: cmake_build
+        run: |
+          export CMAKE_PREFIX_PATH=dawn
+          cmake -B build -G "Ninja" -DCMAKE_BUILD_TYPE=Release -DGGML_WEBGPU=ON -DGGML_METAL=OFF -DGGML_BLAS=OFF
+          time cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+  ubuntu:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: webgpu-ubuntu-24.04
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo add-apt-repository -y ppa:kisak/kisak-mesa
+          sudo apt-get update -y
+          sudo apt-get install -y build-essential mesa-vulkan-drivers \
+            libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libssl-dev
+
+      - name: Dawn Dependency
+        id: dawn-depends
+        run: |
+          sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
+          DAWN_VERSION="v20260317.182325"
+          DAWN_OWNER="google"
+          DAWN_REPO="dawn"
+          DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
+          echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          curl -L -o artifact.tar.gz \
+            "https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
+          mkdir dawn
+          tar -xvf artifact.tar.gz -C dawn --strip-components=1
+
+      - name: Build
+        id: cmake_build
+        run: |
+          export Dawn_DIR=dawn/lib64/cmake/Dawn
+          cmake -B build \
+            -DGGML_WEBGPU=ON
+          time cmake --build build --config Release -j $(nproc)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          # This is using llvmpipe and runs slower than other backends
+          # test-backend-ops is too slow on llvmpipe, skip it
+          ctest -L main -E test-backend-ops --verbose --timeout 900
+
+  ubuntu-wasm:
+    strategy:
+      matrix:
+        include:
+          - build: 'x64'
+            os: ubuntu-24.04
+          - build: 'arm64'
+            os: ubuntu-24.04-arm
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: webgpu-${{ matrix.os }}-wasm
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
+      - name: Install Emscripten
+        run: |
+          git clone https://github.com/emscripten-core/emsdk.git
+          cd emsdk
+          ./emsdk install latest
+          ./emsdk activate latest
+
+      - name: Fetch emdawnwebgpu
+        run: |
+          DAWN_TAG="v20260317.182325"
+          EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
+          echo "Downloading ${EMDAWN_PKG}"
+          curl -L -o emdawn.zip \
+            "https://github.com/google/dawn/releases/download/${DAWN_TAG}/${EMDAWN_PKG}"
+          unzip emdawn.zip
+
+      - name: Build WASM WebGPU
+        run: |
+          source emsdk/emsdk_env.sh
+          emcmake cmake -B build-wasm \
+            -G "Ninja" \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DGGML_WEBGPU=ON \
+            -DLLAMA_OPENSSL=OFF \
+            -DEMDAWNWEBGPU_DIR=emdawnwebgpu_pkg
+
+          time cmake --build build-wasm --config Release --target test-backend-ops -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
--- a/.github/workflows/check-vendor.yml
+++ b/.github/workflows/check-vendor.yml
@@ -19,7 +19,7 @@ on:

 jobs:
  check-vendor:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]

    steps:
      - name: Checkout
--- a/.github/workflows/code-style.yml
+++ b/.github/workflows/code-style.yml
@@ -15,7 +15,7 @@ concurrency:

 jobs:
  model-naming:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
    steps:
      - uses: actions/checkout@v6
      - name: Check model naming conventions
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -11,6 +11,11 @@ name: Publish Docker image

 on:
  workflow_dispatch: # allows manual triggering
+    inputs:
+      skip_s390x:
+        description: "Skip the s390x build target (useful for fast test runs that do not need the IBM Z runner)"
+        type: boolean
+        default: false
  schedule:
    # Rebuild daily rather than on every push because it is expensive
    - cron: '12 4 * * *'
@@ -64,6 +69,8 @@ jobs:
      - name: Generate build and merge matrices
        id: matrices
        shell: bash
+        env:
+          SKIP_S390X: ${{ inputs.skip_s390x || 'false' }}
        run: |
          set -euo pipefail

@@ -86,6 +93,11 @@ jobs:
          ]
          JSON

+          if [ "${SKIP_S390X}" = "true" ]; then
+            jq 'map(select(.platforms != "linux/s390x"))' build-matrix.json > build-matrix.json.tmp
+            mv build-matrix.json.tmp build-matrix.json
+          fi
+
          BUILD_MATRIX="$(jq -c . build-matrix.json)"
          MERGE_MATRIX="$(jq -c '
            reduce .[] as $entry ({}; .[$entry.tag] |= (
@@ -132,6 +144,7 @@ jobs:
        config: ${{ fromJSON(needs.prepare_matrices.outputs.build_matrix) }}
    steps:
      - name: Check out the repo
+        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0
@@ -187,6 +200,10 @@ jobs:
        env:
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

+      - name: Get build date
+        id: build_date
+        run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT
+
      - name: Free Disk Space (Ubuntu)
        if: ${{ matrix.config.free_disk_space == true }}
        uses: ggml-org/free-disk-space@v1.3.1
@@ -211,13 +228,26 @@ jobs:
        with:
          context: .
          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
+          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
          file: ${{ matrix.config.dockerfile }}
          target: full
          provenance: false
          build-args: |
+            BUILD_DATE=${{ steps.build_date.outputs.date }}
+            APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
+            APP_REVISION=${{ steps.checkout.outputs.commit }}
+            IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
+            IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
+          annotations: |
+            manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
+            manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
+            manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
+            manifest:org.opencontainers.image.title=llama.cpp
+            manifest:org.opencontainers.image.description=LLM inference in C/C++
+            manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
+            manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
          # using github experimental cache
          #cache-from: type=gha
          #cache-to: type=gha,mode=max
@@ -235,13 +265,26 @@ jobs:
        with:
          context: .
          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
+          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
          file: ${{ matrix.config.dockerfile }}
          target: light
          provenance: false
          build-args: |
+            BUILD_DATE=${{ steps.build_date.outputs.date }}
+            APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
+            APP_REVISION=${{ steps.checkout.outputs.commit }}
+            IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
+            IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
+          annotations: |
+            manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
+            manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
+            manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
+            manifest:org.opencontainers.image.title=llama.cpp
+            manifest:org.opencontainers.image.description=LLM inference in C/C++
+            manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
+            manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
          # using github experimental cache
          #cache-from: type=gha
          #cache-to: type=gha,mode=max
@@ -259,13 +302,26 @@ jobs:
        with:
          context: .
          platforms: ${{ matrix.config.platforms }}
-          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true
+          outputs: type=image,name=${{ steps.meta.outputs.image_repo }},push-by-digest=true,name-canonical=true,push=true,oci-mediatypes=true
          file: ${{ matrix.config.dockerfile }}
          target: server
          provenance: false
          build-args: |
+            BUILD_DATE=${{ steps.build_date.outputs.date }}
+            APP_VERSION=${{ needs.create_tag.outputs.source_tag }}
+            APP_REVISION=${{ steps.checkout.outputs.commit }}
+            IMAGE_URL=${{ github.server_url }}/${{ github.repository }}
+            IMAGE_SOURCE=${{ github.server_url }}/${{ github.repository }}
            ${{ matrix.config.ubuntu_version && format('UBUNTU_VERSION={0}', matrix.config.ubuntu_version) || '' }}
            ${{ matrix.config.cuda_version && format('CUDA_VERSION={0}', matrix.config.cuda_version) || '' }}
+          annotations: |
+            manifest:org.opencontainers.image.created=${{ steps.build_date.outputs.date }}
+            manifest:org.opencontainers.image.version=${{ needs.create_tag.outputs.source_tag }}
+            manifest:org.opencontainers.image.revision=${{ steps.checkout.outputs.commit }}
+            manifest:org.opencontainers.image.title=llama.cpp
+            manifest:org.opencontainers.image.description=LLM inference in C/C++
+            manifest:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}
+            manifest:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}
          # using github experimental cache
          #cache-from: type=gha
          #cache-to: type=gha,mode=max
@@ -330,10 +386,15 @@ jobs:

    steps:
      - name: Check out the repo
+        id: checkout
        uses: actions/checkout@v6
        with:
          fetch-depth: 0

+      - name: Get build date
+        id: build_date
+        run: echo "date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")" >> $GITHUB_OUTPUT
+
      - name: Download digest metadata
        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8
        with:
@@ -361,6 +422,8 @@ jobs:
          IMAGE_REPO="ghcr.io/${REPO_OWNER}/${REPO_NAME}"
          PREFIX="${IMAGE_REPO}:"
          SRC_TAG="${{ needs.create_tag.outputs.source_tag }}"
+          BUILD_DATE="${{ steps.build_date.outputs.date }}"
+          COMMIT_SHA="${{ steps.checkout.outputs.commit }}"
          TAGS="${{ matrix.config.tag }}"
          ARCHES="${{ matrix.config.arches }}"
          DIGEST_GLOB="/tmp/digests/*.tsv"
@@ -412,11 +475,21 @@ jobs:
                  refs+=("${IMAGE_REPO}@${digest}")
              done

+              local annotations=(
+                  --annotation "index:org.opencontainers.image.created=${BUILD_DATE}"
+                  --annotation "index:org.opencontainers.image.version=${SRC_TAG}"
+                  --annotation "index:org.opencontainers.image.revision=${COMMIT_SHA}"
+                  --annotation "index:org.opencontainers.image.title=llama.cpp"
+                  --annotation "index:org.opencontainers.image.description=LLM inference in C/C++"
+                  --annotation "index:org.opencontainers.image.url=${{ github.server_url }}/${{ github.repository }}"
+                  --annotation "index:org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}"
+              )
+
              echo "Creating ${merged_tag} from ${refs[*]}"
-              docker buildx imagetools create --tag "${merged_tag}" "${refs[@]}"
+              docker buildx imagetools create "${annotations[@]}" --tag "${merged_tag}" "${refs[@]}"

              echo "Creating ${merged_versioned_tag} from ${refs[*]}"
-              docker buildx imagetools create --tag "${merged_versioned_tag}" "${refs[@]}"
+              docker buildx imagetools create "${annotations[@]}" --tag "${merged_versioned_tag}" "${refs[@]}"
          }

          for tag in $TAGS; do
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -15,7 +15,7 @@ concurrency:

 jobs:
  editorconfig:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
    steps:
      - uses: actions/checkout@v6
      - uses: editorconfig-checker/action-editorconfig-checker@840e866d93b8e032123c23bac69dece044d4d84c # v2.2.0
--- a/.github/workflows/hip-quality-check.yml
+++ b/.github/workflows/hip-quality-check.yml
@@ -28,9 +28,9 @@ concurrency:
 env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1

 jobs:
  ubuntu-22-hip-quality-check:
@@ -50,7 +50,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.21
        with:
-          key: ubuntu-22-hip-quality-check
+          key: hip-quality-check-ubuntu-22.04
          evict-old-files: 1d
          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}

--- a/.github/workflows/pre-tokenizer-hashes.yml
+++ b/.github/workflows/pre-tokenizer-hashes.yml
@@ -3,16 +3,16 @@ name: Check Pre-Tokenizer Hashes
 on:
    push:
        paths:
-            - 'convert_hf_to_gguf.py'
+            - 'conversion/base.py'
            - 'convert_hf_to_gguf_update.py'
    pull_request:
        paths:
-            - 'convert_hf_to_gguf.py'
+            - 'conversion/base.py'
            - 'convert_hf_to_gguf_update.py'

 jobs:
    pre-tokenizer-hashes:
-        runs-on: ubuntu-slim
+        runs-on: [self-hosted, fast]

        steps:
        - name: Checkout repository
@@ -30,16 +30,16 @@ jobs:

        - name: Update pre-tokenizer hashes
          run: |
-              cp convert_hf_to_gguf.py /tmp
+              cp conversion/base.py /tmp
              .venv/bin/python convert_hf_to_gguf_update.py --check-missing

        - name: Check if committed pre-tokenizer hashes matches generated version
          run: |
-              if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
-                  echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
-                  echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
+              if ! diff -q conversion/base.py /tmp/base.py; then
+                  echo "Model pre-tokenizer hashes (in conversion/base.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
+                  echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated conversion/base.py along with your changes"
                  echo "Differences found:"
-                  diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
+                  diff conversion/base.py /tmp/base.py || true
                  exit 1
              fi
              echo "Model pre-tokenizer hashes are up to date."
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -20,7 +20,7 @@ concurrency:

 jobs:
  python-check-requirements:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, CPU, fast]
    name: check-requirements
    steps:
      - name: Check out source repository
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -21,7 +21,7 @@ concurrency:

 jobs:
  flake8-lint:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
    name: Lint
    steps:
      - name: Check out source repository
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -22,7 +22,7 @@ concurrency:

 jobs:
  python-type-check:
-    runs-on: ubuntu-slim
+    runs-on: [self-hosted, fast]
    name: python type-check
    steps:
      - name: Check out source repository
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
--- a/.github/workflows/server-sanitize.yml
+++ b/.github/workflows/server-sanitize.yml
@@ -26,10 +26,10 @@ on:
    ]

 env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@@ -37,7 +37,7 @@ concurrency:

 jobs:
  server:
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, CPU, Linux, llama-server]

    strategy:
      matrix:
@@ -46,19 +46,19 @@ jobs:
      fail-fast: false

    steps:
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get -y install \
-            build-essential \
-            xxd \
-            git \
-            cmake \
-            curl \
-            wget \
-            language-pack-en \
-            libssl-dev
+      #- name: Dependencies
+      #  id: depends
+      #  run: |
+      #    sudo apt-get update
+      #    sudo apt-get -y install \
+      #      build-essential \
+      #      xxd \
+      #      git \
+      #      cmake \
+      #      curl \
+      #      wget \
+      #      language-pack-en \
+      #      libssl-dev

      - name: Clone
        id: checkout
@@ -67,6 +67,13 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

+      - name: Setup Node.js
+        uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+          cache: "npm"
+          cache-dependency-path: "tools/ui/package-lock.json"
+
      - name: Build
        id: cmake_build
        run: |
--- a/.github/workflows/server-self-hosted.yml
+++ b/.github/workflows/server-self-hosted.yml
@@ -29,10 +29,10 @@ on:
    ]

 env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@@ -67,6 +67,13 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

+      - name: Setup Node.js
+        uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+          cache: "npm"
+          cache-dependency-path: "tools/ui/package-lock.json"
+
      - name: Build
        id: cmake_build
        run: |
@@ -84,42 +91,106 @@ jobs:
          export ${{ matrix.extra_args }}
          pytest -v -x -m "not slow"

-  # TODO: provision CUDA runner
-  #  server-cuda:
-  #    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
-  #
-  #    name: server-cuda (${{ matrix.wf_name }})
-  #    strategy:
-  #      matrix:
-  #        build_type: [Release]
-  #        wf_name: ["GPUx1"]
-  #        include:
-  #          - build_type: Release
-  #            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
-  #            wf_name:    "GPUx1, backend-sampling"
-  #      fail-fast: false
-  #
-  #    steps:
-  #      - name: Clone
-  #        id: checkout
-  #        uses: actions/checkout@v6
-  #        with:
-  #          fetch-depth: 0
-  #          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-  #
-  #      - name: Build
-  #        id: cmake_build
-  #        run: |
-  #          cmake -B build -DGGML_SCHED_NO_REALLOC=ON
-  #          cmake --build build --config ${{ matrix.build_type }} -j $(sysctl -n hw.logicalcpu) --target llama-server
-  #
-  #      - name: Tests
-  #        id: server_integration_tests
-  #        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
-  #        run: |
-  #          cd tools/server/tests
-  #          python3 -m venv venv
-  #          source venv/bin/activate
-  #          pip install -r requirements.txt
-  #          export ${{ matrix.extra_args }}
-  #          pytest -v -x -m "not slow"
+  server-cuda:
+    runs-on: [self-hosted, llama-server, Linux, NVIDIA]
+
+    name: server-cuda (${{ matrix.wf_name }})
+    strategy:
+      matrix:
+        build_type: [Release]
+        wf_name: ["GPUx1"]
+        include:
+          - build_type: Release
+            extra_args: "LLAMA_ARG_BACKEND_SAMPLING=1"
+            wf_name:    "GPUx1, backend-sampling"
+      fail-fast: false
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build -DGGML_CUDA=ON -DGGML_SCHED_NO_REALLOC=ON
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Tests
+        id: server_integration_tests
+        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+        run: |
+          cd tools/server/tests
+          python3 -m venv venv
+          source venv/bin/activate
+          pip install -r requirements.txt
+          export ${{ matrix.extra_args }}
+          pytest -v -x -m "not slow"
+
+  server-kleidiai:
+    runs-on: ah-ubuntu_22_04-c8g_8x
+
+    name: server-kleidiai (${{ matrix.wf_name }})
+    strategy:
+      matrix:
+        include:
+          - build_type: Release
+            extra_build_flags: "-DGGML_CPU_KLEIDIAI=ON"
+            extra_args: ""
+            wf_name:    "CPUx1, kleidiai"
+      fail-fast: false
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Dependencies
+        id: depends
+        run: |
+          set -euxo pipefail
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive NEEDRESTART_MODE=a \
+          apt-get install -y \
+           build-essential \
+           libssl-dev \
+           python3-venv \
+           gpg \
+           wget \
+           time \
+           git-lfs
+
+          git lfs install
+
+          # install the latest cmake
+          sudo install -d /usr/share/keyrings
+          wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc \
+           | gpg --dearmor \
+           | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null
+          echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ jammy main' \
+           | sudo tee /etc/apt/sources.list.d/kitware.list
+          sudo apt-get update
+          sudo apt-get install -y cmake
+
+      - name: Build
+        id: cmake_build
+        run: |
+          cmake -B build -DGGML_SCHED_NO_REALLOC=ON ${{ matrix.extra_build_flags }}
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
+      - name: Tests
+        id: server_integration_tests
+        if: ${{ (!matrix.disabled_on_pr || !github.event.pull_request) }}
+        run: |
+          cd tools/server/tests
+          python3 -m venv venv
+          source venv/bin/activate
+          pip install -r requirements.txt
+          export ${{ matrix.extra_args }}
+          pytest -v -x -m "not slow"
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -44,20 +44,20 @@ on:
    ]

 env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

 jobs:
-  server:
-    runs-on: ubuntu-latest
+  ubuntu:
+    runs-on: ubuntu-24.04

-    name: server (${{ matrix.wf_name }})
+    name: ubuntu (${{ matrix.wf_name }})
    strategy:
      matrix:
        build_type: [Release]
@@ -93,11 +93,17 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: server-ubuntu-24.04-x64
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
-            -DLLAMA_BUILD_BORINGSSL=ON \
            -DGGML_SCHED_NO_REALLOC=ON
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

@@ -124,8 +130,8 @@ jobs:
          export ${{ matrix.extra_args }}
          SLOW_TESTS=1 pytest -v -x

-  server-windows:
-    runs-on: windows-2022
+  windows:
+    runs-on: windows-2025

    steps:
      - name: Clone
@@ -135,11 +141,24 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.21
+        with:
+          key: server-windows-2025-x64
+          evict-old-files: 1d
+          save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
+
      - name: Build
        id: cmake_build
+        shell: cmd
        run: |
-          cmake -B build -DLLAMA_BUILD_BORINGSSL=ON -DGGML_SCHED_NO_REALLOC=ON
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
+          cmake -B build -G "Ninja Multi-Config" ^
+            -DCMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake ^
+            -DCMAKE_BUILD_TYPE=Release ^
+            -DLLAMA_BUILD_BORINGSSL=ON ^
+            -DGGML_SCHED_NO_REALLOC=ON
+          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
+          cmake --build build --config Release -j %NINJA_JOBS% --target llama-server

      - name: Python setup
        id: setup_python
--- a/.github/workflows/ui-build.yml
+++ b/.github/workflows/ui-build.yml
@@ -0,0 +1,43 @@
+name: UI Build
+
+on:
+  workflow_call:
+
+jobs:
+  build:
+    runs-on: [self-hosted, fast]
+    env:
+      BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+          cache: "npm"
+          cache-dependency-path: "tools/ui/package-lock.json"
+
+      - name: Install dependencies
+        run: npm ci
+        working-directory: tools/ui
+
+      - name: Build application
+        run: npm run build
+        working-directory: tools/ui
+
+      - name: Generate checksums
+        run: |
+          cd tools/ui/dist
+          for f in *; do
+            sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
+          done
+
+      - name: Upload built UI
+        uses: actions/upload-artifact@v6
+        with:
+          name: ui-build
+          path: tools/ui/dist/
+          retention-days: 1
--- a/.github/workflows/ui-publish.yml
+++ b/.github/workflows/ui-publish.yml
@@ -0,0 +1,70 @@
+name: UI Publish
+
+on:
+  workflow_call:
+    inputs:
+      version_tag:
+        description: 'Version tag to publish under (e.g., b1234)'
+        required: true
+        type: string
+    secrets:
+      hf_token:
+        description: 'Hugging Face token with write access'
+        required: true
+
+jobs:
+  build:
+    name: Build static output
+    uses: ./.github/workflows/ui-build.yml
+
+  publish:
+    name: Publish UI Static Output
+    needs: build
+    runs-on: ubuntu-24.04-arm
+
+    permissions:
+      contents: read
+
+    env:
+      HF_BUCKET_NAME: ${{ vars.HF_BUCKET_UI_STATIC_OUTPUT }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 1
+
+      - name: Download UI build artifact
+        uses: actions/download-artifact@v7
+        with:
+          name: ui-build
+          path: tools/ui/dist/
+
+      - name: Install Hugging Face Hub CLI
+        run: pip install -U huggingface_hub
+
+      - name: Authenticate with Hugging Face
+        run: hf auth login --token ${{ secrets.hf_token }}
+
+      - name: Sync built files to Hugging Face bucket (version tag)
+        run: |
+          # Upload the built files to the Hugging Face bucket under the release version
+          hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
+
+      - name: Sync built files to Hugging Face bucket (latest)
+        run: |
+          # Also upload to the 'latest' directory for fallback downloads
+          hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
+
+      - name: Verify upload
+        run: |
+          # List the files in the bucket to verify the upload
+          hf buckets list hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} -R -h
+
+      - name: Clean up root-level files
+        run: |
+          # Clean up any old root-level files from previous non-versioned deployments
+          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/index.html --yes 2>/dev/null || true
+          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/bundle.js --yes 2>/dev/null || true
+          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/bundle.css --yes 2>/dev/null || true
+          hf buckets rm ggml-org/${{ env.HF_BUCKET_NAME }}/loading.html --yes 2>/dev/null || true
--- a/.github/workflows/ui-self-hosted.yml
+++ b/.github/workflows/ui-self-hosted.yml
@@ -0,0 +1,118 @@
+name: UI (self-hosted)
+
+# these are the same as ui.yml, but with self-hosted runners
+# the runners come with pre-installed Playwright browsers version: 1.56.1
+# the jobs are much lighter because they don't need to install node and playwright browsers
+
+on:
+  workflow_dispatch:
+    inputs:
+      sha:
+        description: 'Commit SHA1 to build'
+        required: false
+        type: string
+  push:
+    branches:
+      - master
+    paths: [
+      '.github/workflows/ui-self-hosted.yml',
+      '.github/workflows/ui-build.yml',
+      'tools/ui/**.*',
+      'tools/server/tests/**.*'
+    ]
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      '.github/workflows/ui-self-hosted.yml',
+      '.github/workflows/ui-build.yml',
+      'tools/ui/**.*',
+      'tools/server/tests/**.*'
+    ]
+
+env:
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  ui-build:
+    name: Build static output
+    uses: ./.github/workflows/ui-build.yml
+
+  ui-checks:
+    name: Checks
+    needs: ui-build
+    runs-on: [self-hosted, PLAYWRIGHT]
+    continue-on-error: true
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Install dependencies
+        id: setup
+        run: npm ci
+        working-directory: tools/ui
+
+      - name: Run type checking
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run check
+        working-directory: tools/ui
+
+      - name: Run linting
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run lint
+        working-directory: tools/ui
+
+      - name: Run Client tests
+        if: ${{ always() }}
+        run: npm run test:client
+        working-directory: tools/ui
+
+      - name: Run Unit tests
+        if: ${{ always() }}
+        run: npm run test:unit
+        working-directory: tools/ui
+
+  e2e-tests:
+    name: E2E Tests
+    needs: ui-build
+    runs-on: [self-hosted, PLAYWRIGHT]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Install dependencies
+        id: setup
+        run: npm ci
+        working-directory: tools/ui
+
+      - name: Build application
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run build
+        working-directory: tools/ui
+
+      - name: Build Storybook
+        if: ${{ always() }}
+        run: npm run build-storybook
+        working-directory: tools/ui
+
+      - name: Run UI tests
+        if: ${{ always() }}
+        run: npm run test:ui -- --testTimeout=60000
+        working-directory: tools/ui
+
+      - name: Run E2E tests
+        if: ${{ always() }}
+        run: npm run test:e2e
+        working-directory: tools/ui
--- a/.github/workflows/server-webui.yml
+++ b/.github/workflows/server-webui.yml
@@ -1,7 +1,7 @@
-name: Server WebUI
+name: UI

 on:
-  workflow_dispatch: # allows manual triggering
+  workflow_dispatch:
    inputs:
      sha:
        description: 'Commit SHA1 to build'
@@ -11,34 +11,39 @@ on:
    branches:
      - master
    paths: [
-      '.github/workflows/server-webui.yml',
-      'tools/server/webui/**.*',
-      'tools/server/tests/**.*',
-      'tools/server/public/**'
+      '.github/workflows/ui.yml',
+      '.github/workflows/ui-build.yml',
+      'tools/ui/**.*',
+      'tools/server/tests/**.*'
    ]
  pull_request:
    types: [opened, synchronize, reopened]
    paths: [
-      '.github/workflows/server-webui.yml',
-      'tools/server/webui/**.*',
-      'tools/server/tests/**.*',
-      'tools/server/public/**'
+      '.github/workflows/ui.yml',
+      '.github/workflows/ui-build.yml',
+      'tools/ui/**.*',
+      'tools/server/tests/**.*'
    ]

 env:
-  LLAMA_LOG_COLORS: 1
-  LLAMA_LOG_PREFIX: 1
-  LLAMA_LOG_TIMESTAMPS: 1
-  LLAMA_LOG_VERBOSITY: 10
+  LLAMA_ARG_LOG_COLORS: 1
+  LLAMA_ARG_LOG_PREFIX: 1
+  LLAMA_ARG_LOG_TIMESTAMPS: 1
+  LLAMA_ARG_LOG_VERBOSITY: 10

 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

 jobs:
-  webui-check:
-    name: WebUI Checks
-    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+  ui-build:
+    name: Build static output
+    uses: ./.github/workflows/ui-build.yml
+
+  ui-checks:
+    name: Checks
+    needs: ui-build
+    runs-on: ubuntu-latest
    continue-on-error: true
    steps:
      - name: Checkout code
@@ -51,58 +56,89 @@ jobs:
        id: node
        uses: actions/setup-node@v6
        with:
-          node-version: "22"
+          node-version: "24"
          cache: "npm"
-          cache-dependency-path: "tools/server/webui/package-lock.json"
+          cache-dependency-path: "tools/ui/package-lock.json"

      - name: Install dependencies
        id: setup
        if: ${{ steps.node.conclusion == 'success' }}
        run: npm ci
-        working-directory: tools/server/webui
+        working-directory: tools/ui

      - name: Run type checking
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run check
-        working-directory: tools/server/webui
+        working-directory: tools/ui

      - name: Run linting
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npm run lint
-        working-directory: tools/server/webui
-
-      - name: Build application
-        if: ${{ always() && steps.setup.conclusion == 'success' }}
-        run: npm run build
-        working-directory: tools/server/webui
+        working-directory: tools/ui

      - name: Install Playwright browsers
        id: playwright
        if: ${{ always() && steps.setup.conclusion == 'success' }}
        run: npx playwright install --with-deps
-        working-directory: tools/server/webui
-
-      - name: Build Storybook
-        if: ${{ always() && steps.playwright.conclusion == 'success' }}
-        run: npm run build-storybook
-        working-directory: tools/server/webui
+        working-directory: tools/ui

      - name: Run Client tests
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:client
-        working-directory: tools/server/webui
+        working-directory: tools/ui

      - name: Run Unit tests
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:unit
-        working-directory: tools/server/webui
+        working-directory: tools/ui
+
+  e2e-tests:
+    name: E2E Tests
+    needs: ui-build
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+
+      - name: Setup Node.js
+        id: node
+        uses: actions/setup-node@v6
+        with:
+          node-version: "24"
+          cache: "npm"
+          cache-dependency-path: "tools/ui/package-lock.json"
+
+      - name: Install dependencies
+        id: setup
+        if: ${{ steps.node.conclusion == 'success' }}
+        run: npm ci
+        working-directory: tools/ui
+
+      - name: Build application
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npm run build
+        working-directory: tools/ui
+
+      - name: Install Playwright browsers
+        id: playwright
+        if: ${{ always() && steps.setup.conclusion == 'success' }}
+        run: npx playwright install --with-deps
+        working-directory: tools/ui
+
+      - name: Build Storybook
+        if: ${{ always() && steps.playwright.conclusion == 'success' }}
+        run: npm run build-storybook
+        working-directory: tools/ui

      - name: Run UI tests
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:ui -- --testTimeout=60000
-        working-directory: tools/server/webui
+        working-directory: tools/ui

      - name: Run E2E tests
        if: ${{ always() && steps.playwright.conclusion == 'success' }}
        run: npm run test:e2e
-        working-directory: tools/server/webui
+        working-directory: tools/ui
--- a/.github/workflows/update-ops-docs.yml
+++ b/.github/workflows/update-ops-docs.yml
@@ -3,18 +3,20 @@ name: Update Operations Documentation
 on:
    push:
        paths:
+            - '.github/workflows/update-ops-docs.yml'
            - 'docs/ops.md'
            - 'docs/ops/**'
            - 'scripts/create_ops_docs.py'
    pull_request:
        paths:
+            - '.github/workflows/update-ops-docs.yml'
            - 'docs/ops.md'
            - 'docs/ops/**'
            - 'scripts/create_ops_docs.py'

 jobs:
    update-ops-docs:
-        runs-on: ubuntu-slim
+        runs-on: [self-hosted, fast, ARM64]

        steps:
        - name: Checkout repository
--- a/.gitignore
+++ b/.gitignore
@@ -92,12 +92,12 @@
 !/examples/sycl/*.bat
 !/examples/sycl/*.sh

-# Server Web UI temporary files
+# Server Web UI temporary files (+ legacy directory)

 /tools/server/webui/node_modules
 /tools/server/webui/dist
-# we no longer use gz for index.html
-/tools/server/public/index.html.gz
+/tools/ui/node_modules
+/tools/ui/dist

 # Python

--- a/.pi/gg/SYSTEM.md
+++ b/.pi/gg/SYSTEM.md
@@ -1,7 +1,7 @@
 You are a coding agent. Here are some very important rules that you must follow:

 General:
- By very precise and concise when writing code, comments, explanations, etc.
+- Be very precise and concise when writing code, comments, explanations, etc.
 - PR and commit titles format: `<module> : <title>`. Lookup recents for examples
 - Don't try to build or run the code unless you are explicitly asked to do so
 - Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources
@@ -16,12 +16,15 @@ Pull requests (PRs):
 - New branch names are prefixed with "gg/"
 - Before opening a pull request, ask the user to confirm the description
 - When creating a pull request, look for the repository's PR template and follow it
- For the AI usage disclosure section, write "YES. llama.cpp + pi"
+- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
+- Ask the user to tell you what model was used and write it in place of [MODEL]
 - Always create the pull requests in draft mode

 Commits:
 - On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
 - Do not explicitly set the git author in commits - rely on the default git config
+- Always use `--no-gpg-sign` when committing
+- Never `git push` without explicit confirmation from the user

 Resources (read on demand):
 - [CONTRIBUTING.md](CONTRIBUTING.md)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,13 +104,16 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})

 # extra artifacts
-option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_TOOLS    "llama: build tools"          ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_WEBUI    "llama: build the embedded Web UI for server"  ON)
-option(LLAMA_TOOLS_INSTALL  "llama: install tools"        ${LLAMA_TOOLS_INSTALL_DEFAULT})
-option(LLAMA_TESTS_INSTALL  "llama: install tests"        ON)
+option(LLAMA_BUILD_TESTS     "llama: build tests"                                                                ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_TOOLS     "llama: build tools"                                                                ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES  "llama: build examples"                                                             ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER    "llama: build server example"                                                       ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_APP       "llama: build the unified binary"                                                   ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_UI        "llama: build the embedded Web UI for server"                                       ON)
+option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
+
+option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
+option(LLAMA_TESTS_INSTALL "llama: install tests" ON)

 # 3rd party libs
 option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" ON)
@@ -215,6 +218,10 @@ if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)
    add_subdirectory(tools)
 endif()

+if (LLAMA_BUILD_APP)
+    add_subdirectory(app)
+endif()
+
 # Automatically add all files from the 'licenses' directory
 file(GLOB EXTRA_LICENSES "${CMAKE_SOURCE_DIR}/licenses/LICENSE-*")

@@ -270,18 +277,6 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
              ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)

-install(
-    FILES convert_hf_to_gguf.py
-    PERMISSIONS
-        OWNER_READ
-        OWNER_WRITE
-        OWNER_EXECUTE
-        GROUP_READ
-        GROUP_EXECUTE
-        WORLD_READ
-        WORLD_EXECUTE
-    DESTINATION ${CMAKE_INSTALL_BINDIR})
-
 configure_file(cmake/llama.pc.in
        "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
        @ONLY)
--- a/6
+++ b/6
@@ -15,7 +15,7 @@
 # ggml-org/llama-common     : ggerganov, aldehir, angt, danbev, ngxson, pwilkin
 # ggml-org/llama-mtmd       : ngxson
 # ggml-org/llama-server     : ggerganov, ngxson, allozaur, angt, ServeurpersoCom
-# ggml-org/llama-webui      : allozaur
+# ggml-org/llama-ui           : allozaur

 /.devops/*.Dockerfile                   @ngxson
 /.github/actions/                       @ggml-org/ci
@@ -26,6 +26,7 @@
 /common/fit.*                           @JohannesGaessler
 /common/jinja/                          @CISC
 /common/ngram-map.*                     @srogmann
+/conversion/                            @CISC
 /convert_*.py                           @CISC
 /docs/backend/snapdragon/               @ggml-org/ggml-hexagon
 /examples/batched.swift/                @ggerganov
@@ -48,7 +49,6 @@
 /examples/parallel/                     @ggerganov
 /examples/passkey/                      @ggerganov
 /examples/retrieval/                    @ggerganov
-/examples/save-load-state/              @ggerganov
 /examples/speculative-simple/           @ggerganov
 /examples/speculative/                  @ggerganov
 /ggml/cmake/                            @ggerganov
@@ -107,7 +107,7 @@
 /tools/rpc/                             @ggml-org/ggml-rpc
 /tools/server/*                         @ggml-org/llama-server # no subdir
 /tools/server/tests/                    @ggml-org/llama-server
-/tools/server/webui/                    @ggml-org/llama-webui
+/tools/ui/                              @ggml-org/llama-ui
 /tools/tokenize/                        @ggerganov
 /tools/tts/                             @ggerganov
 /vendor/                                @ggerganov
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -46,7 +46,9 @@ Before submitting your PR:
    - provide KL divergence data calculated vs. the FP16/BF16 (whichever is the native precision) version for both the new type as well as types of similar size
    - provide [performance data](https://github.com/ggml-org/llama.cpp/tree/master/tools/llama-bench) for the new type in comparison to types of similar size on pure CPU
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If you are a new contributor, limit your open PRs to 1.
+- If you are a new contributor
+    - Limit your open PRs to 1
+    - Do not submit trivial fixes (e.g. typos, formatting changes)

 After submitting your PR:
 - Expect requests for modifications to ensure the code meets llama.cpp's standards for quality and long-term maintainability
@@ -61,6 +63,7 @@ After submitting your PR:
 - Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
 - Let other maintainers merge their own PRs
 - When merging a PR, make sure you have a good understanding of the changes
+- If a PR does not warrant a new release, add `[no release]` in the squashed commit to spare CI resources
 - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)

 Maintainers reserve the right to decline review or close pull requests for any reason, without any questions, particularly under any of the following conditions:
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@ LLM inference in C/C++
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
 - Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+- WebGPU support is now available in the browser, see a blog/demo introducing it [here](https://reeselevine.github.io/llamas-on-the-web/).

 ----

@@ -172,6 +173,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
 - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
+- Ruby: [docusealco/rllama](https://github.com/docusealco/rllama)
 - Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
@@ -279,7 +281,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Metal](docs/build.md#metal-build) | Apple Silicon |
 | [BLAS](docs/build.md#blas-build) | All |
 | [BLIS](docs/backend/BLIS.md) | All |
-| [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [SYCL](docs/backend/SYCL.md) | Intel GPU |
 | [OpenVINO [In Progress]](docs/backend/OPENVINO.md) | Intel CPUs, GPUs, and NPUs |
 | [MUSA](docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](docs/build.md#cuda) | Nvidia GPU |
@@ -289,7 +291,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
 | [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
-| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
+| [WebGPU](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
 | [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
 | [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@@ -0,0 +1,20 @@
+set(TARGET llama-app)
+
+add_executable(${TARGET} llama.cpp)
+set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama)
+
+target_link_libraries(${TARGET} PRIVATE
+    llama-server-impl
+    llama-cli-impl
+    llama-completion-impl
+    llama-bench-impl
+    llama-batched-bench-impl
+    llama-fit-params-impl
+    llama-quantize-impl
+    llama-perplexity-impl
+)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
--- a/app/llama.cpp
+++ b/app/llama.cpp
@@ -0,0 +1,95 @@
+#include "build-info.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+// visible
+int llama_server(int argc, char ** argv);
+int llama_cli(int argc, char ** argv);
+
+// hidden
+int llama_completion(int argc, char ** argv);
+int llama_bench(int argc, char ** argv);
+int llama_batched_bench(int argc, char ** argv);
+int llama_fit_params(int argc, char ** argv);
+int llama_quantize(int argc, char ** argv);
+int llama_perplexity(int argc, char ** argv);
+
+static int help(int argc, char ** argv);
+static int version(int argc, char ** argv);
+
+struct command {
+    const char * name;
+    const char * desc;
+    std::vector<std::string> aliases;
+    bool hidden;
+    int (*func)(int, char **);
+};
+
+static const command cmds[] = {
+    {"serve",         "HTTP API server",                                    {"server"},   false, llama_server       },
+    {"cli",           "Command-line interactive interface",                 {"client"},   false, llama_cli          },
+    {"completion",    "Text completion",                                    {"complete"}, true,  llama_completion   },
+    {"bench",         "Benchmark prompt processing and text generation",    {},           true,  llama_bench        },
+    {"batched-bench", "Benchmark batched decoding performance",             {},           true,  llama_batched_bench},
+    {"fit-params",    "Compute parameters to fit a model in device memory", {},           true,  llama_fit_params   },
+    {"quantize",      "Quantize a model",                                   {},           true,  llama_quantize     },
+    {"perplexity",    "Compute model perplexity and KL divergence",         {},           true,  llama_perplexity   },
+    {"version",       "Show version",                                       {},           true,  version            },
+    {"help",          "Show available commands",                            {},           true,  help               },
+};
+
+static int version(int argc, char ** argv) {
+    printf("%s\n", llama_build_info());
+    return 0;
+}
+
+static int help(int argc, char ** argv) {
+    const bool show_all = argc >= 2 && std::string(argv[1]) == "all";
+
+    printf("Usage: llama <command> [options]\n\nAvailable commands:\n");
+
+    for (const auto & cmd : cmds) {
+        if (show_all || !cmd.hidden) {
+            printf("  %-15s %s\n", cmd.name, cmd.desc);
+        }
+    }
+    printf("\nRun 'llama <command> --help' for command-specific usage.\n");
+
+    return 0;
+}
+
+static bool matches(const std::string & arg, const command & cmd) {
+    if (arg == cmd.name) {
+        return true;
+    }
+    for (const auto & alias : cmd.aliases) {
+        if (arg == alias) {
+            return true;
+        }
+    }
+    return false;
+}
+
+int main(int argc, char ** argv) {
+    const std::string arg = argc >= 2 ? argv[1] : "help";
+
+    for (const auto & cmd : cmds) {
+        if (matches(arg, cmd)) {
+
+            // router spawns children through this same binary, it needs the
+            // subcommand to relaunch as 'llama serve' and not bare options
+#ifdef _WIN32
+            _putenv_s("LLAMA_APP_CMD", cmd.name);
+#else
+            setenv("LLAMA_APP_CMD", cmd.name, 1);
+#endif
+            return cmd.func(argc - 1, argv + 1);
+        }
+    }
+
+    fprintf(stderr, "error: unknown command '%s'\n", arg.c_str());
+    return 1;
+}
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -7,6 +7,7 @@ VISIONOS_MIN_OS_VERSION=1.0
 TVOS_MIN_OS_VERSION=16.4

 BUILD_SHARED_LIBS=OFF
+LLAMA_BUILD_APP=OFF
 LLAMA_BUILD_EXAMPLES=OFF
 LLAMA_BUILD_TOOLS=OFF
 LLAMA_BUILD_TESTS=OFF
@@ -31,6 +32,7 @@ COMMON_CMAKE_ARGS=(
    -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
    -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
    -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
+    -DLLAMA_BUILD_APP=${LLAMA_BUILD_APP}
    -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
    -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
    -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -66,6 +66,8 @@ fi

 if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
+else
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF"
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
@@ -114,9 +116,12 @@ fi
 if [ ! -z ${GG_BUILD_VULKAN} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"

-    # if on Mac, disable METAL
    if [[ "$OSTYPE" == "darwin"* ]]; then
-        CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
+        MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION="/usr/local/lib/cmake/vulkan"
+        MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION="${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers/SPIRV-HeadersConfig.cmake"
+        if [[ -f "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" || -h "${MACOS_RUNNER_CUSTOM_SPIRV_HEADERS_LOCATION}" ]]; then
+            CMAKE_EXTRA="${CMAKE_EXTRA} -DSPIRV-Headers_DIR=${MACOS_RUNNER_CUSTOM_VULKAN_CMAKE_LOCATION}/SPIRV-Headers"
+        fi
    fi

    # Build shared libs on Windows
@@ -127,7 +132,7 @@ if [ ! -z ${GG_BUILD_VULKAN} ]; then
 fi

 if [ ! -z ${GG_BUILD_WEBGPU} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1 -DGGML_METAL=OFF -DGGML_BLAS=OFF"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"

    if [ ! -z "${GG_BUILD_WEBGPU_DAWN_PREFIX}" ]; then
        if [ -z "${CMAKE_PREFIX_PATH}" ]; then
@@ -161,6 +166,8 @@ fi

 if [ ! -z ${GG_BUILD_BLAS} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=${GG_BUILD_BLAS_VENDOR:-OpenBLAS}"
+else
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_BLAS=OFF"
 fi

 if [ ! -z ${GG_BUILD_OPENVINO} ]; then
@@ -232,7 +239,7 @@ function gg_run_ctest_debug {
    (cmake -G "${CMAKE_GENERATOR}" -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time cmake --build . --config Debug -j$(nproc)) 2>&1 | tee -a $OUT/${ci}-make.log

-    (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (time ctest -C Debug --output-on-failure -L main -E "test-opt|test-backend-ops|test-llama-archs" ${CTEST_EXTRA}) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
 }
@@ -455,10 +462,10 @@ function gg_run_qwen3_0_6b {

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on  --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off                ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on                 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on  --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off                ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on                 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -694,8 +701,8 @@ function gg_sum_test_backend_ops_cpu {

 ## main

-export LLAMA_LOG_PREFIX=1
-export LLAMA_LOG_TIMESTAMPS=1
+export LLAMA_ARG_LOG_PREFIX=1
+export LLAMA_ARG_LOG_TIMESTAMPS=1

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models
--- a/cmake/llama-config.cmake.in
+++ b/cmake/llama-config.cmake.in
@@ -7,7 +7,7 @@ set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)

 set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
 set_and_check(LLAMA_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
-set_and_check(LLAMA_BIN_DIR     "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
+set(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")

 find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)

--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -4,7 +4,6 @@
 #include "chat.h"
 #include "common.h"
 #include "download.h"
-#include "hf-cache.h"
 #include "json-schema-to-grammar.h"
 #include "log.h"
 #include "sampling.h"
@@ -308,12 +307,14 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
    common_download_opts opts;
    opts.bearer_token = params.hf_token;
    opts.offline = params.offline;
+
+    LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str());
    const int status = common_download_file_single(preset_url, preset_path, opts);
    const bool has_preset = status >= 200 && status < 400;

    // remote preset is optional, so we don't error out if not found
    if (has_preset) {
-        LOG_INF("applying remote preset from %s\n", preset_url.c_str());
+        LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str());
        common_preset_context ctx(ex, /* only_remote_allowed */ true);
        common_preset global;
        auto remote_presets = ctx.load_from_ini(preset_path, global);
@@ -326,7 +327,7 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
            throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section");
        }
    } else {
-        LOG_INF("%s", "no remote preset found, skipping\n");
+        LOG_TRC("%s: no remote preset found, skipping\n", __func__);
    }

    return has_preset;
@@ -335,11 +336,15 @@ static bool common_params_handle_remote_preset(common_params & params, llama_exa
 struct handle_model_result {
    bool found_mmproj = false;
    common_params_model mmproj;
+
+    bool found_mtp = false;
+    common_params_model mtp;
 };

 static handle_model_result common_params_handle_model(struct common_params_model & model,
                                                      const std::string          & bearer_token,
-                                                      bool                         offline) {
+                                                      bool                         offline,
+                                                      bool                         search_mtp = false) {
    handle_model_result result;

    if (!model.docker_repo.empty()) {
@@ -354,7 +359,7 @@ static handle_model_result common_params_handle_model(struct common_params_model
        common_download_opts opts;
        opts.bearer_token = bearer_token;
        opts.offline = offline;
-        auto download_result = common_download_model(model, opts, true);
+        auto download_result = common_download_model(model, opts, true, search_mtp);

        if (download_result.model_path.empty()) {
            throw std::runtime_error("failed to download model from Hugging Face");
@@ -367,6 +372,11 @@ static handle_model_result common_params_handle_model(struct common_params_model
            result.found_mmproj = true;
            result.mmproj.path  = download_result.mmproj_path;
        }
+
+        if (!download_result.mtp_path.empty()) {
+            result.found_mtp = true;
+            result.mtp.path  = download_result.mtp_path;
+        }
    } else if (!model.url.empty()) {
        if (model.path.empty()) {
            auto f = string_split<std::string>(model.url, '#').front();
@@ -434,7 +444,11 @@ static bool parse_bool_value(const std::string & value) {
 //

 void common_params_handle_models(common_params & params, llama_example curr_ex) {
-    auto res = common_params_handle_model(params.model, params.hf_token, params.offline);
+    const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(),
+                                         params.speculative.types.end(),
+                                         COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end();
+
+    auto res = common_params_handle_model(params.model, params.hf_token, params.offline, spec_type_draft_mtp);
    if (params.no_mmproj) {
        params.mmproj = {};
    } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -448,6 +462,14 @@ void common_params_handle_models(common_params & params, llama_example curr_ex)
            break;
        }
    }
+    // when --spec-type mtp is set and no draft model was provided explicitly,
+    // fall back to the MTP head discovered alongside the -hf model
+    if (spec_type_draft_mtp && res.found_mtp &&
+        params.speculative.draft.mparams.path.empty() &&
+        params.speculative.draft.mparams.hf_repo.empty() &&
+        params.speculative.draft.mparams.url.empty()) {
+        params.speculative.draft.mparams.path = res.mtp.path;
+    }
    common_params_handle_model(params.speculative.draft.mparams, params.hf_token, params.offline);
    common_params_handle_model(params.vocoder.model,             params.hf_token, params.offline);
 }
@@ -514,7 +536,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
                throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
            }
            if (!seen_args.insert(arg).second) {
-                LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+                const bool skip = (arg == "--spec-type");
+
+                if (!skip) {
+                    LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+                }
            }
            auto & tmp = arg_to_options[arg];
            auto opt = *tmp.first;
@@ -563,12 +589,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
    // parse the first time to get -hf option (used for remote preset)
    parse_cli_args();

-    // TODO: Remove later
-    try {
-        hf_cache::migrate_old_cache_to_hf_cache(params.hf_token, params.offline);
-    } catch (const std::exception & e) {
-        LOG_WRN("HF cache migration failed: %s\n", e.what());
-    }
    // export_graph_ops loads only metadata
    const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS;

@@ -877,7 +897,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<com
            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
        }
        if (!seen_args.insert(arg).second) {
-            LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+            const bool skip = (arg == "--spec-type");
+
+            if (!skip) {
+                LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str());
+            }
        }
        auto opt = *arg_to_options[arg];
        std::string val;
@@ -1310,12 +1334,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
-        {"-cpent", "--checkpoint-every-n-tokens"}, "N",
-        string_format("create a checkpoint every n tokens during prefill (processing), -1 to disable (default: %d)", params.checkpoint_every_nt),
+        {"-cms", "--checkpoint-min-step"}, "N",
+        string_format("minimum spacing between context checkpoints in tokens (default: %d, 0 = no minimum)", params.checkpoint_min_step),
        [](common_params & params, int value) {
-            params.checkpoint_every_nt = value;
+            if (value < 0) {
+                throw std::invalid_argument("checkpoint-min-step must be non-negative");
+            }
+            params.checkpoint_min_step = value;
        }
-    ).set_env("LLAMA_ARG_CHECKPOINT_EVERY_NT").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    ).set_env("LLAMA_ARG_CHECKPOINT_MIN_SPACING_NT").set_examples({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-cram", "--cache-ram"}, "N",
        string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
@@ -2785,7 +2812,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, int value) {
            params.embd_normalize = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_DEBUG}));
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_DEBUG}));
    add_opt(common_arg(
        {"--embd-output-format"}, "FORMAT",
        "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
@@ -2842,28 +2869,64 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.api_prefix = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
+    // Deprecated: use --ui-config instead (kept for backward compat)
    add_opt(common_arg(
        {"--webui-config"}, "JSON",
-        "JSON that provides default WebUI settings (overrides WebUI defaults)",
+        "[DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)",
        [](common_params & params, const std::string & value) {
+            params.ui_config_json = value;
            params.webui_config_json = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG"));
+
+    add_opt(common_arg(
+        {"--ui-config"}, "JSON",
+        "JSON that provides default UI settings (overrides UI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.ui_config_json = value;
+            params.webui_config_json = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG"));
+
+    // Deprecated: use --ui-config-file instead (kept for backward compat)
    add_opt(common_arg(
        {"--webui-config-file"}, "PATH",
-        "JSON file that provides default WebUI settings (overrides WebUI defaults)",
+        "[DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)",
        [](common_params & params, const std::string & value) {
-            params.webui_config_json = read_file(value);
+            params.ui_config_json = read_file(value);
+            params.webui_config_json = params.ui_config_json;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE"));
+
+    add_opt(common_arg(
+        {"--ui-config-file"}, "PATH",
+        "JSON file that provides default UI settings (overrides UI defaults)",
+        [](common_params & params, const std::string & value) {
+            params.ui_config_json = read_file(value);
+            params.webui_config_json = params.ui_config_json;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG_FILE"));
+
+    // Deprecated: use --ui-mcp-proxy instead (kept for backward compat)
    add_opt(common_arg(
        {"--webui-mcp-proxy"},
        {"--no-webui-mcp-proxy"},
-        string_format("experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: %s)", params.webui_mcp_proxy ? "enabled" : "disabled"),
+        "[DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy",
        [](common_params & params, bool value) {
+            params.ui_mcp_proxy = value;
            params.webui_mcp_proxy = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY"));
+
+    add_opt(common_arg(
+        {"--ui-mcp-proxy"},
+        {"--no-ui-mcp-proxy"},
+        "experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)",
+        [](common_params & params, bool value) {
+            params.ui_mcp_proxy = value;
+            params.webui_mcp_proxy = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_MCP_PROXY"));
    add_opt(common_arg(
        {"--tools"}, "TOOL1,TOOL2,...",
        "experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)\n"
@@ -2873,14 +2936,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.server_tools = parse_csv_row(value);
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS"));
+    // Deprecated: use --ui/--no-ui instead (kept for backward compat)
    add_opt(common_arg(
        {"--webui"},
        {"--no-webui"},
-        string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        "[DEPRECATED: use --ui/--no-ui] whether to enable the Web UI",
        [](common_params & params, bool value) {
+            params.ui = value;
            params.webui = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
+
+    add_opt(common_arg(
+        {"--ui"},
+        {"--no-ui"},
+        string_format("whether to enable the Web UI (default: %s)", params.ui ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.ui = value;
+            params.webui = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI"));
    add_opt(common_arg(
        {"--embedding", "--embeddings"},
        string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -2923,7 +2998,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
            key_file.close();
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_KEY_FILE"));
    add_opt(common_arg(
        {"--ssl-key-file"}, "FNAME",
        "path to file a PEM-encoded SSL private key",
@@ -2951,7 +3026,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.default_template_kwargs[item.key()] = item.value().dump();
            }
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CHAT_TEMPLATE_KWARGS"));
    add_opt(common_arg(
        {"-to", "--timeout"}, "N",
        string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -3252,7 +3327,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params &, const std::string & value) {
            common_log_set_file(common_log_main(), value.c_str());
        }
-    ).set_env("LLAMA_LOG_FILE"));
+    ).set_env("LLAMA_ARG_LOG_FILE"));
    add_opt(common_arg(
        {"--log-colors"}, "[on|off|auto]",
        "Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
@@ -3269,7 +3344,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                    string_format("error: unknown value for --log-colors: '%s'\n", value.c_str()));
            }
        }
-    ).set_env("LLAMA_LOG_COLORS"));
+    ).set_env("LLAMA_ARG_LOG_COLORS"));
    add_opt(common_arg(
        {"-v", "--verbose", "--log-verbose"},
        "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
@@ -3284,7 +3359,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.offline = true;
        }
-    ).set_env("LLAMA_OFFLINE"));
+    ).set_env("LLAMA_ARG_OFFLINE"));
    add_opt(common_arg(
        {"-lv", "--verbosity", "--log-verbosity"}, "N",
        string_format("Set the verbosity threshold. Messages with a higher verbosity will be ignored. Values:\n"
@@ -3292,27 +3367,30 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            " - 1: error\n"
            " - 2: warning\n"
            " - 3: info\n"
-            " - 4: debug\n"
+            " - 4: trace (more info)\n"
+            " - 5: debug\n"
            "(default: %d)\n", params.verbosity),
        [](common_params & params, int value) {
            params.verbosity = value;
            common_log_set_verbosity_thold(value);
        }
-    ).set_env("LLAMA_LOG_VERBOSITY"));
+    ).set_env("LLAMA_ARG_LOG_VERBOSITY"));
    add_opt(common_arg(
        {"--log-prefix"},
+        {"--no-log-prefix"},
        "Enable prefix in log messages",
-        [](common_params &) {
-            common_log_set_prefix(common_log_main(), true);
+        [](common_params &, bool value) {
+            common_log_set_prefix(common_log_main(), value);
        }
-    ).set_env("LLAMA_LOG_PREFIX"));
+    ).set_env("LLAMA_ARG_LOG_PREFIX"));
    add_opt(common_arg(
        {"--log-timestamps"},
+        {"--no-log-timestamps"},
        "Enable timestamps in log messages",
-        [](common_params &) {
-            common_log_set_timestamps(common_log_main(), true);
+        [](common_params &, bool value) {
+            common_log_set_timestamps(common_log_main(), value);
        }
-    ).set_env("LLAMA_LOG_TIMESTAMPS"));
+    ).set_env("LLAMA_ARG_LOG_TIMESTAMPS"));

    //
    // speculative parameters
@@ -3516,6 +3594,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.speculative.draft.p_min = std::stof(value);
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN"));
+    add_opt(common_arg(
+        {"--spec-draft-backend-sampling"},
+        {"--no-spec-draft-backend-sampling"},
+        string_format("offload draft sampling to the backend (default: %s)",
+                      params.speculative.draft.backend_sampling ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.speculative.draft.backend_sampling = value;
+        }
+    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING"));
    add_opt(common_arg(
        {"--spec-draft-device", "-devd", "--device-draft"}, "<dev1,dev2,..>",
        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -3556,8 +3643,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        string_format("comma-separated list of types of speculative decoding to use (default: %s)\n",
            common_speculative_type_name_str(params.speculative.types).c_str()),
        [](common_params & params, const std::string & value) {
-            const auto enabled_types = string_split<std::string>(value, ',');
-            params.speculative.types = common_speculative_types_from_names(enabled_types);
+            const auto types_str = string_split<std::string>(value, ',');
+            auto types = common_speculative_types_from_names(types_str);
+            params.speculative.types.insert(params.speculative.types.end(), types.begin(), types.end());
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_TYPE"));
    add_opt(common_arg(
@@ -4046,10 +4134,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--spec-default"},
        string_format("enable default speculative decoding config"),
        [](common_params & params) {
-            params.speculative.types = { COMMON_SPECULATIVE_TYPE_NGRAM_MOD };
+            params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_NGRAM_MOD);
            params.speculative.ngram_mod.n_match = 24;
            params.speculative.ngram_mod.n_min = 48;
            params.speculative.ngram_mod.n_max = 64;
+
+            // TODO: not sure if this is a good config - explore more settings and potentially enable it
+            //params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V);
+            //params.speculative.ngram_map_k4v.size_n = 8;
+            //params.speculative.ngram_map_k4v.size_m = 24;
+            //params.speculative.ngram_map_k4v.min_hits = 2;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));

--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -43,11 +43,33 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
                                                  const autoparser &              autoparser) {
    // Create the result structure
    common_chat_params data;
-    data.prompt           = common_chat_template_direct_apply(tmpl, inputs);
-    data.format           = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.preserved_tokens = autoparser.preserved_tokens;
+    data.prompt            = common_chat_template_direct_apply(tmpl, inputs);
+    data.generation_prompt = common_chat_template_generation_prompt(tmpl, inputs);
+    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.preserved_tokens  = autoparser.preserved_tokens;

-    auto parser = autoparser.build_parser(inputs);
+    std::string parser_generation_prompt = data.generation_prompt;
+
+    if (inputs.continue_final_message != COMMON_CHAT_CONTINUATION_NONE && !inputs.continue_msg.empty()) {
+        // Build up generation prompt manually
+        const auto & msg = inputs.continue_msg;
+
+        if (!autoparser.reasoning.start.empty()) {
+            data.generation_prompt = data.generation_prompt.substr(0, data.generation_prompt.find(autoparser.reasoning.start));
+            data.generation_prompt += autoparser.reasoning.start + msg.reasoning_content;
+            if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
+                data.generation_prompt += autoparser.reasoning.end;
+            }
+        }
+
+        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
+            data.generation_prompt += msg.render_content();
+        }
+
+        data.prompt += data.generation_prompt;
+    }
+
+    auto parser = autoparser.build_parser(inputs, parser_generation_prompt);
    data.parser = parser.save();

    // Build grammar if tools are present
@@ -87,7 +109,7 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
    return data;
 }

-common_peg_arena autoparser::build_parser(const generation_params & inputs) const {
+common_peg_arena autoparser::build_parser(const generation_params & inputs, const std::string & generation_prompt) const {
    if (!analysis_complete) {
        throw std::invalid_argument("Cannot call build_parser on autoparser without performing analysis first, call analyze_template(...)");
    }
@@ -121,7 +143,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs) cons
        } else {
            parser = content.build_parser(ctx);
        }
-        return pure_content ? p.prefix(inputs.generation_prompt, reasoning.start) + parser : p.prefix(inputs.generation_prompt, reasoning.start) << parser;
+        return pure_content ? p.prefix(generation_prompt, reasoning.start) + parser : p.prefix(generation_prompt, reasoning.start) << parser;
    });
 }

--- a/common/chat-auto-parser-helpers.cpp
+++ b/common/chat-auto-parser-helpers.cpp
@@ -310,6 +310,8 @@ std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segm

 namespace autoparser {

+static const std::string ERR_TMPL = "#**ERROR**#";
+
 std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
    generation_params tmpl_params;
    tmpl_params.messages              = params.messages;
@@ -326,7 +328,7 @@ std::string apply_template(const common_chat_template & tmpl, const template_par
        return common_chat_template_direct_apply(tmpl, tmpl_params);
    } catch (const std::exception & e) {
        LOG_DBG("Template application failed: %s\n", e.what());
-        return "";
+        return ERR_TMPL;
    }
 }

@@ -347,7 +349,7 @@ std::optional<compare_variants_result> compare_variants(
    std::string output_B = apply_template(tmpl, params_B);

    // Check for template application failures
-    if (output_A.empty() || output_B.empty()) {
+    if (output_A == ERR_TMPL || output_B == ERR_TMPL) {
        return std::nullopt;
    }

--- a/common/chat-auto-parser.h
+++ b/common/chat-auto-parser.h
@@ -60,16 +60,21 @@ struct generation_params {
    common_reasoning_format               reasoning_format    = COMMON_REASONING_FORMAT_AUTO;
    bool                                  stream              = true;
    std::string                           grammar;
-    bool                                  add_generation_prompt = false;
-    bool                                  enable_thinking       = true;
-    std::chrono::system_clock::time_point now                   = std::chrono::system_clock::now();
-    std::string                           generation_prompt;
+    bool                                  add_generation_prompt  = false;
+    common_chat_continuation              continue_final_message = COMMON_CHAT_CONTINUATION_NONE;
+    common_chat_msg                       continue_msg;
+    bool                                  enable_thinking        = true;
+    std::chrono::system_clock::time_point now                    = std::chrono::system_clock::now();
    json                                  extra_context;
    bool                                  add_bos       = false;
    bool                                  add_eos       = false;
    bool                                  is_inference  = true;
    bool                                  add_inference = false;
    bool                                  mark_input    = true;  // whether to mark input strings in the jinja context
+
+    bool has_continuation() const {
+        return continue_final_message != COMMON_CHAT_CONTINUATION_NONE && !continue_msg.empty();
+    }
 };

 // ============================================================================
@@ -372,6 +377,8 @@ struct analyze_tools : analyze_base {

 struct autoparser {
    jinja::caps          jinja_caps;
+    std::string          user_start;
+    std::string          assistant_start;
    analyze_reasoning    reasoning;
    analyze_content      content;
    analyze_tools        tools;
@@ -382,11 +389,15 @@ struct autoparser {

    autoparser() = default;

+    // Find the starting marker for the user message and assistant message
+    std::string detect_user_start_marker(const common_chat_template & tmpl);
+    std::string detect_assistant_start_marker(const common_chat_template & tmpl);
+
    // Run full differential analysis on a template
    void analyze_template(const common_chat_template & tmpl);

    // Build the PEG parser for this template
-    common_peg_arena build_parser(const generation_params & inputs) const;
+    common_peg_arena build_parser(const generation_params & inputs, const std::string & generation_prompt) const;

  private:
    // Collect tokens from entire analysis to preserve
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@@ -8,6 +8,9 @@
 #include "peg-parser.h"

 #include <algorithm>
+#include <cctype>
+#include <ostream>
+#include <sstream>

 #define ANSI_RESET  "\033[0m"
 #define ANSI_PURPLE "\033[1m\x1b[38;5;126m"
@@ -23,6 +26,7 @@ static const std::string FUN_SECOND = "SSS_SECOND_FUN_S";
 static const std::string ARG_FIRST = "AA_ARG_FST_AA";
 static const std::string ARG_SECOND = "BB_ARG_SND_BB";
 static const std::string USER_MSG = "U_USER_MSG Hello END_U";
+static const std::string USER_MSG_TWO = "V_USER_MSG Hello END_V";
 static const std::string ASSISTANT_MSG = "A_ASST_MSG I can help END_A";
 static const std::string THINKING_CONTENT = "REASON_PART I am thinking END_R";
 static const std::string CALL_ID_001 = "call00001";
@@ -71,6 +75,7 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
              analysis.content.end   = "<|END_OF_TURN_TOKEN|>";
              analysis.preserved_tokens.push_back("<|CHATBOT_TOKEN|>");
              analysis.preserved_tokens.push_back("<|END_OF_TURN_TOKEN|>");
+              analysis.user_start = "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>";
              LOG_DBG(ANSI_ORANGE "[Patch: Cohere Command R+]\n" ANSI_RESET);
          }
      },
@@ -108,7 +113,59 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
              analysis.tools.function.close        = "```";
              LOG_DBG(ANSI_ORANGE "[Patch: DeepSeek-R1-Distill-Qwen]\n" ANSI_RESET);
          }
-      }
+      },
+      // Nemotron Nano v2
+      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
+          if (tmpl.src.find("<SPECIAL_10>") != std::string::npos && tmpl.src.find("<SPECIAL_11>") != std::string::npos &&
+              tmpl.src.find("<SPECIAL_12>") != std::string::npos && tmpl.src.find("<TOOL_RESPONSE>") != std::string::npos) {
+
+              analysis.tools.format.mode           = tool_format::JSON_NATIVE;
+              analysis.tools.format.section_start  = "";
+              analysis.tools.format.section_end    = "";
+              analysis.tools.format.per_call_start = "<TOOLCALL>";
+              analysis.tools.format.per_call_end   = "</TOOLCALL>";
+              analysis.content.mode                = content_mode::PLAIN;
+              analysis.content.start               = "";
+              analysis.content.end                 = "";
+              analysis.reasoning.mode              = reasoning_mode::TAG_BASED;
+              analysis.reasoning.start             = "<think>\n\n";
+              analysis.reasoning.end               = "</think>";
+              analysis.assistant_start             = "<SPECIAL_11>Assistant";
+              analysis.user_start                  = "<SPECIAL_11>User";
+              analysis.preserved_tokens.clear();
+              analysis.preserved_tokens.push_back("<SPECIAL_12>");
+              analysis.preserved_tokens.push_back("<SPECIAL_11>");
+              analysis.preserved_tokens.push_back("</think>");
+              analysis.preserved_tokens.push_back("<TOOLCALL>");
+              analysis.preserved_tokens.push_back("</TOOLCALL>");
+              LOG_DBG(ANSI_ORANGE "[Patch: Nemotron Nano v2]\n" ANSI_RESET);
+          }
+      },
+      // Fireworks
+      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
+          if (tmpl.src.find("{%- set system_prompt = '<|start_header_id|>' + 'system' + '<|end_header_id|>\\n\\n'"
+            " + message['content'] | trim + '\\n' + system_prompt_suffix + '<|eot_id|>' -%}") != std::string::npos) {
+              analysis.assistant_start             = "<|start_header_id|>assistant<|end_header_id|>";
+              analysis.user_start                  = "<|start_header_id|>user<|end_header_id|>";
+              LOG_DBG(ANSI_ORANGE "[Patch: Fireworks v2]\n" ANSI_RESET);
+          }
+      },
+      // Solar Open
+      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
+          if (tmpl.src.find("<|begin|>assistant<|think|><|end|>") != std::string::npos) {
+              analysis.assistant_start             = "<|begin|>assistant";
+              LOG_DBG(ANSI_ORANGE "[Patch: Solar Open]\n" ANSI_RESET);
+          }
+      },
+      // Apriel 1.6
+      [](const common_chat_template & tmpl, autoparser & analysis) -> void {
+          if (tmpl.src.find("if not loop.last and '[BEGIN FINAL RESPONSE]' in asst_text") != std::string::npos) {
+              analysis.user_start                  = "<|begin_user|>";
+              analysis.assistant_start             = "<|begin_assistant|>";
+              LOG_DBG(ANSI_ORANGE "[Patch: Apriel 1.6]\n" ANSI_RESET);
+          }
+      },
+
    });

 // Common JSON structures
@@ -166,6 +223,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
    reasoning = analyze_reasoning(tmpl, jinja_caps.supports_tool_calls);
    content = analyze_content(tmpl, reasoning);
    tools = analyze_tools(jinja_caps.supports_tool_calls ? analyze_tools(tmpl, jinja_caps, reasoning) : analyze_tools());
+    assistant_start = detect_assistant_start_marker(tmpl);
+    user_start = detect_user_start_marker(tmpl);
    collect_preserved_tokens();

    for (auto & workaround : workarounds) {
@@ -173,6 +232,8 @@ void autoparser::analyze_template(const common_chat_template & tmpl) {
    }

    LOG_DBG("\n--- Reasoning & Content Structure ---\n");
+    LOG_DBG("user_msg_start: %s\n", user_start.c_str());
+    LOG_DBG("assistant_msg_start: %s\n", assistant_start.c_str());
    LOG_DBG("reasoning_mode: %s\n", mode_to_str(reasoning.mode).c_str());
    LOG_DBG("reasoning_start: '%s'\n", reasoning.start.c_str());
    LOG_DBG("reasoning_end: '%s'\n", reasoning.end.c_str());
@@ -245,6 +306,120 @@ void autoparser::collect_preserved_tokens() {
    add_token(tools.call_id.suffix);
 }

+std::string autoparser::detect_assistant_start_marker(const common_chat_template & tmpl) {
+    json user_msg = json{
+        { "role",    "user"   },
+        { "content", USER_MSG }
+    };
+
+    json assistant_no_reasoning = json{
+        { "role",    "assistant"   },
+        { "content", ASSISTANT_MSG }
+    };
+
+    template_params params;
+    params.messages              = json::array({ user_msg });
+    params.add_generation_prompt = false;
+    params.enable_thinking       = true;
+
+    auto comparison = compare_variants(
+        tmpl, params, [&](template_params & p) {
+            p.messages = json::array({ user_msg, assistant_no_reasoning });
+        }
+    );
+
+    if (!comparison) {
+        LOG_DBG(ANSI_ORANGE "%s: Template application failed, skipping assistant start detection\n" ANSI_RESET, __func__);
+        return "";
+    }
+
+    auto usermsg = comparison->diff.right;
+    if (usermsg.find(ASSISTANT_MSG) == std::string::npos) {
+        LOG_DBG(ANSI_ORANGE "%s: Did not find assistant message in assistant message block, skipping detection\n" ANSI_RESET, __func__);
+    }
+
+    auto ast_prefix = usermsg.substr(0, usermsg.find(ASSISTANT_MSG));
+    if (!reasoning.start.empty() && ast_prefix.find(trim_whitespace(reasoning.start)) != std::string::npos) {
+        ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.start)));
+    }
+    if (!reasoning.end.empty() && ast_prefix.find(trim_whitespace(reasoning.end)) != std::string::npos) {
+        ast_prefix = ast_prefix.substr(0, ast_prefix.find(trim_whitespace(reasoning.end)));
+    }
+    return trim_whitespace(ast_prefix);
+}
+
+std::string autoparser::detect_user_start_marker(const common_chat_template & tmpl) {
+    json user_msg = json{
+        { "role",    "user"   },
+        { "content", USER_MSG }
+    };
+
+    json assistant = json{
+        { "role",    "assistant"   },
+        { "content", ASSISTANT_MSG }
+    };
+
+    json user_msg_two = json{
+        { "role",    "user"       },
+        { "content", USER_MSG_TWO }
+    };
+
+    template_params params;
+    params.messages              = json::array({});
+    params.add_generation_prompt = false;
+    params.enable_thinking       = true;
+
+    auto comparison = compare_variants(
+        tmpl, params, [&](template_params & p) {
+            p.messages = json::array({ user_msg });
+        }
+    );
+
+    if (!comparison) {
+        LOG_DBG(ANSI_ORANGE "%s: Template application failed, unsupported empty messages? trying complex variant\n" ANSI_RESET, __func__);
+        params.messages = json::array({ user_msg_two, assistant });
+        comparison = compare_variants(
+            tmpl, params, [&](template_params & p) {
+                p.messages = json::array({ user_msg_two, assistant, user_msg });
+            }
+        );
+        if (!comparison) {
+            LOG_DBG(ANSI_ORANGE "%s: Template application failed for reserve variant, aborting\n" ANSI_RESET, __func__);
+            return "";
+        }
+    }
+
+    auto usermsg = comparison->diff.right;
+    if (usermsg.find(USER_MSG) == std::string::npos) {
+        LOG_DBG(ANSI_ORANGE "%s: Did not find user message in user message block, aborting detection\n" ANSI_RESET, __func__);
+    }
+
+    if (usermsg.find(ASSISTANT_MSG) != std::string::npos) {
+        usermsg = usermsg.substr(usermsg.find(ASSISTANT_MSG) + ASSISTANT_MSG.size());
+    }
+
+    auto candidate = usermsg.substr(0, usermsg.find(USER_MSG));
+    auto candidate_split = segmentize_markers(candidate);
+    std::stringstream result;
+    bool encountered_marker = false;
+    for (const auto & mrk : candidate_split) {
+        std::string lower_mrk = std::string(mrk.value);
+        std::transform(lower_mrk.begin(), lower_mrk.end(), lower_mrk.begin(),
+            [](unsigned char c) { return std::tolower(c); });
+        // heuristic to weed out potential end markers, but only at the start
+        if (mrk.type == segment_type::MARKER && !encountered_marker &&
+            (lower_mrk.find("end") != std::string::npos || lower_mrk.find("close") != std::string::npos)) {
+            continue;
+        }
+        if (mrk.type == segment_type::TEXT && !encountered_marker && trim_whitespace(mrk.value).empty()) {
+            continue;
+        }
+        encountered_marker |= mrk.type == segment_type::MARKER;
+        result << mrk.value;
+    }
+    return trim_whitespace(result.str());
+}
+
 analyze_reasoning::analyze_reasoning(const common_chat_template & tmpl, bool supports_tools)
    : analyze_base(tmpl) {
    LOG_DBG(ANSI_PURPLE "=== Starting differential analysis ===\n" ANSI_RESET);
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@@ -358,35 +358,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
            if (is_potential_container) {
                value_content = normalize_container_value(value_content);
            }
-
-            // Try to parse as JSON value (number, bool, null, object, array)
-            try {
-                ordered_json parsed = ordered_json::parse(value_content);
-                if (parsed.is_string()) {
-                    // Don't add closing quote yet (added by arg_close) for monotonic streaming
-                    std::string escaped = parsed.dump();
-                    if (!escaped.empty() && escaped.back() == '"') {
-                        escaped.pop_back();
-                    }
-                    value_to_add          = escaped;
-                    closing_quote_pending = true;
-                } else {
-                    // Non-string values: use raw content to preserve whitespace for monotonicity
-                    value_to_add = value_content;
-                }
-            } catch (...) {
-                if (node.is_partial && is_potential_container) {
-                    // Partial container: pass through the already-normalized content
-                    value_to_add = value_content;
-                } else {
-                    // Not valid JSON - treat as string value
-                    if (!closing_quote_pending) {
-                        value_to_add          = "\"";
-                        closing_quote_pending = true;
-                    }
-                    value_to_add += escape_json_string_inner(value_content);
-                }
-            }
+            value_to_add += value_content;
        }

        args_target() += value_to_add;
@@ -813,7 +785,7 @@ common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const s
    if (delimiter.empty()) {
        return literal(s);
    }
-    return literal(s.substr(0, s.rfind(delimiter)));
+    return literal(s.substr(0, s.find(delimiter)));
 }

 common_peg_parser common_chat_peg_builder::optspace(const std::string & tag) {
--- a/common/chat-peg-parser.h
+++ b/common/chat-peg-parser.h
@@ -90,7 +90,7 @@ class common_chat_peg_builder : public common_peg_parser_builder {

    // Use for schema-declared string types - won't be treated as potential JSON container
    common_peg_parser tool_arg_string_value(const common_peg_parser & p) { return tag(TOOL_ARG_STRING_VALUE, p); }
-    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return atomic(tag(TOOL_ARG_VALUE, p)); }
+    common_peg_parser tool_arg_json_value(const common_peg_parser & p) { return tag(TOOL_ARG_VALUE, p); }


    // Return a parser that parses the prefix of a string, up to a given delimiter.
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -70,6 +70,65 @@ static bool has_content_or_tool_calls(const common_chat_msg & msg) {
    return !msg.content.empty() || !msg.tool_calls.empty();
 }

+std::string common_chat_msg::render_content(const std::string & delimiter) const {
+    if (!content.empty() && !content_parts.empty()) {
+        throw std::runtime_error("Cannot specify both content and content_parts");
+    }
+    if (!content.empty()) {
+        return content;
+    }
+
+    std::string text;
+    for (const auto & part : content_parts) {
+        if (part.type == "text") {
+            if (!text.empty()) {
+                text += delimiter;
+            }
+            text += part.text;
+        }
+    }
+    return text;
+}
+
+std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims) {
+    if (delims.empty() || prompt.empty()) {
+        return {};
+    }
+
+    auto parser = build_peg_parser([&](common_peg_parser_builder & p) {
+        std::vector<std::string>       all_delims;
+        std::vector<common_peg_parser> tagged_messages;
+
+        all_delims.reserve(delims.size());
+        tagged_messages.reserve(delims.size());
+        for (const auto & d : delims) {
+            all_delims.push_back(d.delimiter);
+        }
+
+        auto any_delim = p.until_one_of(all_delims);
+        for (const auto & d : delims) {
+            tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim));
+        }
+
+        return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end();
+    });
+
+    common_peg_parse_context ctx(prompt);
+    const auto result = parser.parse(ctx);
+    if (!result.success()) {
+        return {};
+    }
+
+    std::vector<common_chat_msg_span> spans;
+    ctx.ast.visit(result, [&](const common_peg_ast_node & node) {
+        if (!node.tag.empty()) {
+            spans.push_back({ node.tag, node.start, node.end - node.start });
+        }
+    });
+
+    return spans;
+}
+
 json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
    if (!content.empty() && !content_parts.empty()) {
        throw std::runtime_error("Cannot specify both content and content_parts");
@@ -451,6 +510,22 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
    return result;
 }

+common_chat_continuation common_chat_continuation_parse(const nlohmann::ordered_json & value) {
+    if (value.is_boolean() && value.get<bool>()) {
+        return COMMON_CHAT_CONTINUATION_AUTO;
+    }
+    if (value.is_string()) {
+        auto value_str = value.get<std::string>();
+        if (value_str == "reasoning_content") {
+            return COMMON_CHAT_CONTINUATION_REASONING;
+        }
+        if (value_str == "content") {
+            return COMMON_CHAT_CONTINUATION_CONTENT;
+        }
+    }
+    return COMMON_CHAT_CONTINUATION_NONE;
+}
+
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
    if (use_jinja) {
        try {
@@ -811,6 +886,36 @@ std::string common_chat_template_direct_apply(
    return common_chat_template_direct_apply_impl(tmpl, inputs, std::nullopt, std::nullopt, std::nullopt);
 }

+static std::string common_chat_template_generation_prompt_impl(
+    const common_chat_template & tmpl,
+    const autoparser::generation_params & inputs,
+    const std::optional<json> & messages_override = std::nullopt,
+    const std::optional<json> & tools_override = std::nullopt,
+    const std::optional<json> & additional_context = std::nullopt) {
+
+    auto adjusted_messages = messages_override ? *messages_override : inputs.messages;
+
+    autoparser::generation_params params = inputs;
+    params.add_generation_prompt = false;
+    params.continue_final_message = COMMON_CHAT_CONTINUATION_NONE;
+    std::string no_gen_prompt    = common_chat_template_direct_apply_impl(tmpl, params, adjusted_messages, tools_override, additional_context);
+    params.add_generation_prompt = true;
+    std::string gen_prompt       = common_chat_template_direct_apply_impl(tmpl, params, adjusted_messages, tools_override, additional_context);
+
+    size_t prefix_len = 0;
+    size_t min_size = std::min(no_gen_prompt.size(), gen_prompt.size());
+    while (prefix_len < min_size && no_gen_prompt[prefix_len] == gen_prompt[prefix_len]) {
+        prefix_len++;
+    }
+    return gen_prompt.substr(prefix_len);
+}
+
+std::string common_chat_template_generation_prompt(
+    const common_chat_template & tmpl,
+    const autoparser::generation_params & inputs) {
+    return common_chat_template_generation_prompt_impl(tmpl, inputs, std::nullopt, std::nullopt, std::nullopt);
+}
+
 static common_chat_params common_chat_params_init_ministral_3(const common_chat_template &    tmpl,
                                                              const autoparser::generation_params & inputs) {
    common_chat_params data;
@@ -863,6 +968,7 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
    data.thinking_start_tag = "[THINK]";
    data.thinking_end_tag   = "[/THINK]";
    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs, /* messages_override = */ adjusted_messages);
+    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override = */ adjusted_messages);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.preserved_tokens  = {
        "[THINK]",
@@ -871,8 +977,19 @@ static common_chat_params common_chat_params_init_ministral_3(const common_chat_
        "[ARGS]",
    };

+    if (inputs.has_continuation()) {
+        const auto & msg = inputs.continue_msg;
+
+        data.generation_prompt = "[THINK]" + msg.reasoning_content;
+        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
+            data.generation_prompt += "[/THINK]" + msg.render_content();
+        }
+
+        data.prompt += data.generation_prompt;
+    }
+
    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        auto generation_prompt = p.prefix(inputs.generation_prompt, "[THINK]");
+        auto generation_prompt = p.eps();
        auto reasoning =
            extract_reasoning ? p.optional("[THINK]" + p.reasoning(p.until("[/THINK]")) + "[/THINK]") : p.eps();

@@ -963,6 +1080,15 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
    }

    data.prompt            = prompt;
+    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages);
+    data.message_spans = common_chat_split_by_role(prompt, {
+        { "assistant", "<|start|>assistant" },
+        { "user",      "<|start|>user"      },
+        { "system",    "<|start|>developer" },
+        { "system",    "<|start|>system"    },
+        { "tool",      "<|start|>functions" },
+    });
+
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = true;

@@ -972,6 +1098,18 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
        "<|channel|>", "<|constrain|>", "<|message|>", "<|start|>", "<|end|>",
    };

+    // Adjust prompt for continuation
+    if (inputs.has_continuation()) {
+        const auto & msg = inputs.continue_msg;
+
+        data.generation_prompt = "<|start|>assistant<|channel|>analysis<|message|>" + msg.reasoning_content;
+        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
+            data.generation_prompt += "<|end|><|start|>assistant<|channel|>final<|message|>" + msg.render_content();
+        }
+
+        data.prompt += data.generation_prompt;
+    }
+
    auto has_tools           = inputs.tools.is_array() && !inputs.tools.empty();
    auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
    auto include_grammar     = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
@@ -1080,14 +1218,21 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
    common_chat_params data;

    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);

    if (inputs.add_generation_prompt && string_ends_with(data.prompt, "<turn|>\n")) {
        // This may happen if the model generates content + tool_call, the
        // template does not add the model's next turn and confuses the model
        // from emitting its proper reasoning token sequence.
-        data.prompt += "<|turn>model\n";
+        data.generation_prompt = "<|turn>model\n";
+        data.prompt += data.generation_prompt;
    }

+    data.message_spans = common_chat_split_by_role(data.prompt, {
+        { "user",      "<|turn>user\n"  },
+        { "assistant", "<|turn>model\n" },
+    });
+
    data.format            = COMMON_CHAT_FORMAT_PEG_GEMMA4;
    data.supports_thinking  = true;
    data.thinking_start_tag = "<|channel>thought";
@@ -1101,13 +1246,25 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ
        "<|turn>",
    };

+    if (inputs.has_continuation()) {
+        const auto & msg = inputs.continue_msg;
+
+        data.generation_prompt = string_ends_with(data.prompt, "<turn|>\n") ? "<|turn>model\n" : "";
+        data.generation_prompt += "<|channel>thought\n" + msg.reasoning_content;
+        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
+            data.generation_prompt += "<channel|>" + msg.render_content();
+        }
+
+        data.prompt += data.generation_prompt;
+    }
+
    auto has_tools           = inputs.tools.is_array() && !inputs.tools.empty();
    auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
    auto include_grammar     = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
    auto extract_reasoning   = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;

    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        auto start = p.rule("start", p.prefix(inputs.generation_prompt, "<|channel>"));
+        auto start = p.rule("start", p.optional(p.literal("<|turn>model\n")));

        if (extract_reasoning) {
            p.rule("thought", p.literal("<|channel>thought") + p.space() + p.reasoning(p.until("<channel|>")) + p.literal("<channel|>"));
@@ -1224,15 +1381,22 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
                                                                   const autoparser::generation_params & inputs) {
    common_chat_params data;

-    data.prompt           = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.format           = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.preserved_tokens = {
+    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
+    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.preserved_tokens  = {
        ">>>all",
    };

    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;

+    if (inputs.has_continuation()) {
+        const auto & msg = inputs.continue_msg;
+        data.generation_prompt = "<|start_header_id|>assistant<|end_header_id|>\n\n>>>all\n" + msg.render_content();
+        data.prompt += data.generation_prompt;
+    }
+
    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
        // Functionary v3.2 format:
        // - Normal content: >>>all\n{content}
@@ -1244,7 +1408,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
        // When no tools, content goes until end
        auto content_until_tool = p.literal("all\n") + p.content(p.until(">>>"));
        auto content_until_end  = p.literal("all\n") + p.content(p.rest());
-        auto generation_prompt  = p.literal(inputs.generation_prompt);
+        auto generation_prompt  = p.literal("<|start_header_id|>assistant<|end_header_id|>\n\n>>>");

        // If no tools or tool_choice is NONE, just parse content
        if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
@@ -1318,9 +1482,10 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
                                                          const autoparser::generation_params & inputs) {
    common_chat_params data;

-    data.prompt             = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.format             = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.supports_thinking  = true;
+    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
+    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.supports_thinking = true;
    data.preserved_tokens  = {
        "<|tool_calls_section_begin|>",
        "<|tool_calls_section_end|>",
@@ -1343,10 +1508,22 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp

    const std::string THINK_START = "<think>";
    const std::string THINK_END   = "</think>";
+    const std::string GEN_PROMPT  = "<|im_assistant|>assistant<|im_middle|>";

    data.thinking_start_tag = THINK_START;
    data.thinking_end_tag   = THINK_END;

+    if (inputs.has_continuation()) {
+        const auto & msg = inputs.continue_msg;
+
+        data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
+        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
+            data.generation_prompt += THINK_END + msg.render_content();
+        }
+
+        data.prompt += data.generation_prompt;
+    }
+
    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
        // Kimi K2 Thinking format:
        // - Reasoning: <think>{reasoning}</think>
@@ -1366,7 +1543,7 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
        auto reasoning = extract_reasoning ? p.optional(THINK_START + p.reasoning(
            p.until_one_of({ THINK_END, "<|tool_calls_section_begin|>", "<|tool_call_begin|>" })) +
            p.optional(p.literal(THINK_END))) : p.eps();
-        auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
+        auto generation_prompt = p.literal(GEN_PROMPT);


        // Content only parser (no tools)
@@ -1442,6 +1619,7 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
    common_chat_params data;

    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = true;
    data.preserved_tokens  = {
@@ -1461,12 +1639,24 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
    const std::string TOOL_CALL_END   = "<|tool_call_end|>";
    const std::string THINK_START     = "<think>";
    const std::string THINK_END       = "</think>";
+    const std::string GEN_PROMPT      = "<|im_start|>assistant\n";

    data.thinking_start_tag = THINK_START;
    data.thinking_end_tag   = THINK_END;

+    if (inputs.has_continuation()) {
+        const auto & msg = inputs.continue_msg;
+
+        data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
+        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
+            data.generation_prompt += THINK_END + msg.render_content();
+        }
+
+        data.prompt += data.generation_prompt;
+    }
+
    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
+        auto generation_prompt = p.literal(GEN_PROMPT);
        auto end = p.end();

        auto reasoning = p.eps();
@@ -1521,6 +1711,7 @@ static common_chat_params common_chat_params_init_lfm2_5(const common_chat_templ
    common_chat_params data;

    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = true;
    data.preserved_tokens  = {
@@ -1536,12 +1727,24 @@ static common_chat_params common_chat_params_init_lfm2_5(const common_chat_templ

    const std::string THINK_START     = "<think>";
    const std::string THINK_END       = "</think>";
+    const std::string GEN_PROMPT      = "<|im_start|>assistant\n";

    data.thinking_start_tag = THINK_START;
    data.thinking_end_tag   = THINK_END;

+    if (inputs.has_continuation()) {
+        const auto & msg = inputs.continue_msg;
+
+        data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
+        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
+            data.generation_prompt += THINK_END + msg.render_content();
+        }
+
+        data.prompt += data.generation_prompt;
+    }
+
    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
+        auto generation_prompt = p.literal(GEN_PROMPT);
        auto end = p.end();

        auto reasoning = p.eps();
@@ -1592,6 +1795,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(
    common_chat_params data;

    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs);
    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
    data.supports_thinking = false;
    data.preserved_tokens  = {
@@ -1599,6 +1803,12 @@ static common_chat_params common_chat_params_init_gigachat_v3(
        "<|role_sep|>\n",
    };

+    if (inputs.has_continuation()) {
+        const auto & msg = inputs.continue_msg;
+        data.generation_prompt = "assistant<|role_sep|>\n" + msg.render_content();
+        data.prompt += data.generation_prompt;
+    }
+
    auto has_tools         = inputs.tools.is_array() && !inputs.tools.empty();
    auto include_grammar   = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
    const auto *tool_call_start_prefix = "<|message_sep|>\n\nfunction call<|role_sep|>\n";
@@ -1634,7 +1844,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(
            ret = p.content(p.rest());
        }

-        return p.literal(inputs.generation_prompt) + ret;
+        return p.literal("assistant<|role_sep|>\n") + ret;
    });

    data.parser = parser.save();
@@ -1662,12 +1872,13 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
                                                                 const autoparser::generation_params & inputs) {
    common_chat_params data;

-    data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
-    data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
-    data.supports_thinking = true;
+    data.prompt             = common_chat_template_direct_apply_impl(tmpl, inputs);
+    data.generation_prompt  = common_chat_template_generation_prompt_impl(tmpl, inputs);
+    data.format             = COMMON_CHAT_FORMAT_PEG_NATIVE;
+    data.supports_thinking  = true;
    data.thinking_start_tag = "<think>";
    data.thinking_end_tag   = "</think>";
-    data.preserved_tokens  = {
+    data.preserved_tokens   = {
        "｜DSML｜",
        "<think>",
        "</think>",
@@ -1687,9 +1898,21 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
    const std::string INVOKE_END   = "</" + DSML + "invoke>";
    const std::string PARAM_START  = "<" + DSML + "parameter";
    const std::string PARAM_END    = "</" + DSML + "parameter>";
+    const std::string GEN_PROMPT   = "<｜Assistant｜>";
+
+    if (inputs.has_continuation()) {
+        const auto & msg = inputs.continue_msg;
+
+        data.generation_prompt = GEN_PROMPT + THINK_START + msg.reasoning_content;
+        if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) {
+            data.generation_prompt += THINK_END + msg.render_content();
+        }
+
+        data.prompt += data.generation_prompt;
+    }

    auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
-        auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
+        auto generation_prompt = p.literal(GEN_PROMPT);
        auto end = p.end();

        auto reasoning = p.eps();
@@ -2116,21 +2339,6 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
    return std::nullopt;
 }

-static std::string common_chat_templates_generation_prompt(const common_chat_template & tmpl, const autoparser::generation_params & inputs) {
-    autoparser::generation_params params = inputs;
-    params.add_generation_prompt = false;
-    std::string no_gen_prompt    = common_chat_template_direct_apply_impl(tmpl, params);
-    params.add_generation_prompt = true;
-    std::string gen_prompt       = common_chat_template_direct_apply_impl(tmpl, params);
-
-    size_t prefix_len = 0;
-    size_t min_size = std::min(no_gen_prompt.size(), gen_prompt.size());
-    while (prefix_len < min_size && no_gen_prompt[prefix_len] == gen_prompt[prefix_len]) {
-        prefix_len++;
-    }
-    return gen_prompt.substr(prefix_len);
-}
-
 static common_chat_params common_chat_templates_apply_jinja(const struct common_chat_templates *        tmpls,
                                                            const struct common_chat_templates_inputs & inputs) {
    autoparser::generation_params params;
@@ -2149,6 +2357,27 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
    params.add_bos               = tmpls->add_bos;
    params.add_eos               = tmpls->add_eos;

+    params.continue_final_message = inputs.continue_final_message;
+    if (params.continue_final_message != COMMON_CHAT_CONTINUATION_NONE) {
+        params.add_generation_prompt = false;
+
+        if (!inputs.messages.empty()) {
+            // Render messages[:-1] and store continuation message separately
+            params.continue_msg = inputs.messages.back();
+            params.messages.erase(params.messages.size() - 1);
+        }
+
+        if (params.continue_final_message == COMMON_CHAT_CONTINUATION_AUTO && !inputs.messages.empty()) {
+            // Resolve based on message content
+            params.continue_final_message = COMMON_CHAT_CONTINUATION_CONTENT;
+            if (!params.continue_msg.reasoning_content.empty() &&
+                params.continue_msg.content.empty() &&
+                params.continue_msg.content_parts.empty()) {
+                params.continue_final_message = COMMON_CHAT_CONTINUATION_REASONING;
+            }
+        }
+    }
+
    if (src.find("<|channel|>") == std::string::npos) {
        // map developer to system for all models except for GPT-OSS
        workaround::map_developer_role_to_system(params.messages);
@@ -2169,8 +2398,6 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        workaround::func_args_not_string(params.messages);
    }

-    params.generation_prompt = common_chat_templates_generation_prompt(tmpl, params);
-
    params.extra_context = common_chat_extra_context();
    for (auto el : inputs.chat_template_kwargs) {
        params.extra_context[el.first] = json::parse(el.second);
@@ -2200,17 +2427,16 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        auto params_copy               = params;
        params_copy.reasoning_format   = COMMON_REASONING_FORMAT_NONE;
        data.prompt                    = common_chat_template_direct_apply_impl(tmpl, params_copy);
+        data.generation_prompt         = common_chat_template_generation_prompt_impl(tmpl, params);
        data.format                    = COMMON_CHAT_FORMAT_PEG_NATIVE;
-        data.generation_prompt         = params.generation_prompt;
-        auto parser                    = build_chat_peg_parser([&params](common_chat_peg_builder &p) {
-            return p.prefix(params.generation_prompt) << p.content(p.rest());
+        auto parser                    = build_chat_peg_parser([&data](common_chat_peg_builder &p) {
+            return p.literal(data.generation_prompt) << p.content(p.rest());
        });
        data.parser                    = parser.save();
        return data;
    }

    if (auto result = common_chat_try_specialized_template(tmpl, src, params)) {
-        result->generation_prompt = params.generation_prompt;
        return *result;
    }

@@ -2219,12 +2445,24 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
        struct autoparser::autoparser autoparser;
        autoparser.analyze_template(tmpl);
        auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
+
+        std::vector<common_chat_msg_delimiter> delimiters;
+        if (!autoparser.assistant_start.empty()) {
+            delimiters.push_back({ "assistant", autoparser.assistant_start });
+        }
+        if (!autoparser.user_start.empty()) {
+            delimiters.push_back({ "user", autoparser.user_start });
+        }
+
+        if (!delimiters.empty()) {
+            auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters);
+        }
+
        auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
        if (auto_params.supports_thinking) {
            auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
            auto_params.thinking_end_tag   = trim_whitespace(autoparser.reasoning.end);
        }
-        auto_params.generation_prompt = params.generation_prompt;
        common_peg_arena arena;
        arena.load(auto_params.parser);
        LOG_DBG("%s: generated parser:\n%s\n\nparser generation prompt: %s\n", __func__, arena.dump(arena.root()).c_str(), auto_params.generation_prompt.c_str());
--- a/common/chat.h
+++ b/common/chat.h
@@ -89,6 +89,8 @@ struct common_chat_msg {

    nlohmann::ordered_json to_json_oaicompat(bool concat_typed_text = false) const;

+    std::string render_content(const std::string & delimiter = "\n\n") const;
+
    bool empty() const {
        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() &&
               tool_name.empty() && tool_call_id.empty();
@@ -141,6 +143,17 @@ struct common_chat_msg_diff {
    }
 };

+struct common_chat_msg_span {
+    std::string role;
+    std::size_t pos = 0;
+    std::size_t len = 0;
+};
+
+struct common_chat_msg_delimiter {
+    std::string role;
+    std::string delimiter;
+};
+
 struct common_chat_tool {
    std::string name;
    std::string description;
@@ -164,12 +177,22 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_COUNT,  // Not a format, just the # formats
 };

+
+// Continuation method provided via `continue_final_message`
+enum common_chat_continuation {
+    COMMON_CHAT_CONTINUATION_NONE,
+    COMMON_CHAT_CONTINUATION_AUTO,
+    COMMON_CHAT_CONTINUATION_REASONING,
+    COMMON_CHAT_CONTINUATION_CONTENT,
+};
+
 struct common_chat_templates_inputs {
    std::vector<common_chat_msg>          messages;
    std::string                           grammar;
    std::string                           json_schema;
-    bool                                  add_generation_prompt = true;
-    bool                                  use_jinja             = true;
+    bool                                  add_generation_prompt  = true;
+    common_chat_continuation              continue_final_message = COMMON_CHAT_CONTINUATION_NONE;
+    bool                                  use_jinja              = true;
    // Parameters below only supported when use_jinja is true
    std::vector<common_chat_tool>         tools;
    common_chat_tool_choice               tool_choice         = COMMON_CHAT_TOOL_CHOICE_AUTO;
@@ -196,6 +219,7 @@ struct common_chat_params {
    std::vector<std::string>            preserved_tokens;
    std::vector<std::string>            additional_stops;
    std::string                         parser;
+    std::vector<common_chat_msg_span>   message_spans;
 };

 // per-message parsing syntax
@@ -207,6 +231,8 @@ struct common_chat_parser_params {
    bool                    reasoning_in_content = false;
    std::string             generation_prompt;
    bool                    parse_tool_calls     = true;
+    bool                    is_continuation      = false;
+    bool                    echo                 = false;  // Include assistant prefilled msg in output
    bool                    debug                = false;  // Enable debug output for PEG parser
    common_peg_arena        parser               = {};
    common_chat_parser_params() = default;
@@ -267,6 +293,8 @@ std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::or

 std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);

+common_chat_continuation common_chat_continuation_parse(const nlohmann::ordered_json & value);
+
 // DEPRECATED: only used in tests
 nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);

@@ -279,11 +307,16 @@ std::string common_chat_template_direct_apply(
    const common_chat_template & tmpl,
    const autoparser::generation_params & inputs);

+std::string common_chat_template_generation_prompt(
+    const common_chat_template &          tmpl,
+    const autoparser::generation_params & inputs);
+
 std::optional<common_chat_params> common_chat_try_specialized_template(
        const common_chat_template &          tmpl,
        const std::string &                   src,
        autoparser::generation_params & params);

+
 // specialized per-task preset
 struct common_chat_prompt_preset {
    std::string system;
@@ -291,3 +324,6 @@ struct common_chat_prompt_preset {
 };

 common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
+
+std::vector<common_chat_msg_span> common_chat_split_by_role(const std::string & prompt, const std::vector<common_chat_msg_delimiter> & delims);
+
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -7,6 +7,7 @@
 #include "log.h"
 #include "llama.h"
 #include "sampling.h"
+#include "speculative.h"
 #include "unicode.h"

 #include <algorithm>
@@ -366,15 +367,33 @@ void common_init() {
    SetConsoleCP(CP_UTF8);
 #endif

-    llama_log_set(common_log_default_callback, NULL);
+    common_log_set_prefix(common_log_main(), true);
+    common_log_set_timestamps(common_log_main(), true);

+    llama_log_set(common_log_default_callback, NULL);
+}
+
+void common_params_print_info(const common_params & params, bool print_devices) {
 #ifdef NDEBUG
    const char * build_type = "";
 #else
    const char * build_type = " (debug)";
 #endif
+    LOG_TRC("%s: build %d (%s) with %s for %s%s\n", __func__, llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);

-    LOG_DBG("build: %d (%s) with %s for %s%s\n", llama_build_number(), llama_commit(), llama_compiler(), llama_build_target(), build_type);
+    LOG_INF("log_info: verbosity = %d (adjust with the `-lv N` CLI arg)\n", common_log_get_verbosity_thold());
+
+    // device enumeration creates a primary context on CUDA backends, skip it when the caller does not own any device
+    if (print_devices) {
+        LOG_INF("device_info:\n");
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            auto * dev = ggml_backend_dev_get(i);
+            size_t free, total;
+            ggml_backend_dev_memory(dev, &free, &total);
+            LOG_INF("  - %-8s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+        }
+    }
+    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
 }

 std::string common_params_get_system_info(const common_params & params) {
@@ -426,6 +445,27 @@ std::string string_strip(const std::string & str) {
    return str.substr(start, end - start);
 }

+std::string string_lcs(std::string_view a, std::string_view b) {
+    if (a.empty() || b.empty()) return {};
+
+    std::vector<std::vector<size_t>> dp(a.size() + 1, std::vector<size_t>(b.size() + 1, 0));
+    size_t best_len = 0;
+    size_t best_end_a = 0;
+
+    for (size_t i = 1; i <= a.size(); ++i) {
+        for (size_t j = 1; j <= b.size(); ++j) {
+            if (a[i - 1] == b[j - 1]) {
+                dp[i][j] = dp[i - 1][j - 1] + 1;
+                if (dp[i][j] > best_len) {
+                    best_len = dp[i][j];
+                    best_end_a = i;
+                }
+            }
+        }
+    }
+    return std::string(a.substr(best_end_a - best_len, best_len));
+}
+
 std::string string_get_sortable_timestamp() {
    using clock = std::chrono::system_clock;

@@ -1141,19 +1181,20 @@ struct common_init_result::impl {
    std::vector<llama_sampler_seq_config> samplers_seq_config;
 };

-common_init_result::common_init_result(common_params & params) :
+common_init_result::common_init_result(common_params & params, bool model_only) :
    pimpl(new impl{}) {
    auto mparams = common_model_params_to_llama(params);
    auto cparams = common_context_params_to_llama(params);

    if (params.fit_params) {
-        LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__);
+        LOG_INF("%s: fitting params to device memory ...\n", __func__);
+        LOG_INF("%s: (for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on)\n", __func__);
        common_fit_params(params.model.path.c_str(), &mparams, &cparams,
            params.tensor_split,
            params.tensor_buft_overrides.data(),
            params.fit_params_target.data(),
            params.fit_params_min_ctx,
-            params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+            params.verbosity >= LOG_LEVEL_DEBUG ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
    }

    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
@@ -1163,6 +1204,10 @@ common_init_result::common_init_result(common_params & params) :

    pimpl->model.reset(model);

+    if (model_only) {
+        return;
+    }
+
    const llama_vocab * vocab = llama_model_get_vocab(model);

    // load and optionally apply lora adapters
@@ -1196,7 +1241,7 @@ common_init_result::common_init_result(common_params & params) :
    // initialize once
    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
        if (llama_vocab_is_eog(vocab, i)) {
-            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
+            LOG_TRC("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
        }
    }
@@ -1209,12 +1254,12 @@ common_init_result::common_init_result(common_params & params) :
    }

    //if (params.sampling.penalty_last_n == -1) {
-    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    LOG_TRC("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
    //}

    //if (params.sampling.dry_penalty_last_n == -1) {
-    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    LOG_TRC("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
    //}

@@ -1266,8 +1311,8 @@ std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
    return pimpl->lora;
 }

-common_init_result_ptr common_init_from_params(common_params & params) {
-    common_init_result_ptr res(new common_init_result(params));
+common_init_result_ptr common_init_from_params(common_params & params, bool model_only) {
+    common_init_result_ptr res(new common_init_result(params, model_only));

    llama_model * model = res->model();
    if (model == NULL) {
@@ -1275,6 +1320,10 @@ common_init_result_ptr common_init_from_params(common_params & params) {
        return res;
    }

+    if (model_only) {
+        return res;
+    }
+
    llama_context * lctx = res->context();
    if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
@@ -1338,7 +1387,7 @@ common_init_result_ptr common_init_from_params(common_params & params) {
    }

    if (params.warmup) {
-        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
+        LOG_INF("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

        llama_set_warmup(lctx, true);

@@ -1420,9 +1469,15 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {
        goto done;
    }

+    if (llama_n_rs_seq(ctx) > 0) {
+        LOG_INF("%s: the context supports bounded partial sequence removal\n", __func__);
+        res = COMMON_CONTEXT_SEQ_RM_TYPE_RS;
+        goto done;
+    }
+
    // try to remove the last tokens
    if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
-        LOG_WRN("%s: the context does not support partial sequence removal\n", __func__);
+        LOG_TRC("%s: the context does not support partial sequence removal\n", __func__);
        res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
        goto done;
    }
@@ -1434,6 +1489,23 @@ done:
    return res;
 }

+void common_context_seq_rm(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    auto * mem = llama_get_memory(ctx);
+    if (!llama_memory_seq_rm(mem, seq_id, p0, p1)) {
+        GGML_ABORT("%s", string_format("failed to remove sequence %d with p0=%d, p1=%d\n", seq_id, p0, p1).c_str());
+    }
+}
+
+void common_context_seq_cp(llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    auto * mem = llama_get_memory(ctx);
+    llama_memory_seq_cp(mem, seq_id_src, seq_id_dst, p0, p1);
+}
+
+void common_context_seq_add(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    auto * mem = llama_get_memory(ctx);
+    llama_memory_seq_add(mem, seq_id, p0, p1, delta);
+}
+
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
    std::vector<llama_adapter_lora *> loras;
    std::vector<float> scales;
@@ -1490,6 +1562,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &

    cparams.n_ctx             = params.n_ctx;
    cparams.n_seq_max         = params.n_parallel;
+    cparams.n_rs_seq          = params.speculative.need_n_rs_seq();
    cparams.n_batch           = params.n_batch;
    cparams.n_ubatch          = params.n_ubatch;
    cparams.n_threads         = params.cpuparams.n_threads;
@@ -2059,3 +2132,11 @@ void common_prompt_checkpoint::load_dft(
        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_dft.size(), n);
    }
 }
+
+void common_prompt_checkpoint::clear_tgt() {
+    data_tgt.clear();
+}
+
+void common_prompt_checkpoint::clear_dft() {
+    data_dft.clear();
+}
--- a/common/common.h
+++ b/common/common.h
@@ -13,6 +13,7 @@
 #include <string_view>
 #include <vector>
 #include <map>
+#include <algorithm>

 #if defined(_WIN32) && !defined(_WIN32_WINNT)
 #define _WIN32_WINNT 0x0A00
@@ -159,6 +160,7 @@ enum common_speculative_type {
    COMMON_SPECULATIVE_TYPE_NONE,          // no speculative decoding
    COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE,  // standalone draft model speculative decoding
    COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3,  // Eagle3 speculative decoding
+    COMMON_SPECULATIVE_TYPE_DRAFT_MTP,     // Multi-token prediction
    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding based on n-grams
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
    COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
@@ -297,11 +299,13 @@ struct common_params_model {

 // draft-model-based speculative decoding parameters
 struct common_params_speculative_draft {
-    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min = 0;  // minimum number of draft tokens to use for speculative decoding
+    int32_t n_max = 3; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding

-    float p_split = 0.1f;  // speculative decoding split probability
-    float p_min   = 0.75f; // minimum speculative decoding probability (greedy)
+    float p_split = 0.1f; // speculative decoding split probability
+    float p_min   = 0.0f; // minimum speculative decoding probability (greedy)
+
+    bool backend_sampling = true; // offload draft sampling to the backend (default: on)

    common_params_model mparams;

@@ -355,6 +359,14 @@ struct common_params_speculative {
    bool has_dft() const {
        return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
    }
+
+    uint32_t need_n_rs_seq() const {
+        bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
+            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP;
+        });
+
+        return needs_rs_seq ? draft.n_max : 0u;
+    }
 };

 struct common_params_vocoder {
@@ -582,7 +594,7 @@ struct common_params {
    bool    cache_prompt        = true;  // whether to enable prompt caching
    bool    cache_idle_slots    = true;  // save and clear idle slots upon starting a new task
    int32_t n_ctx_checkpoints   = 32;    // max number of context checkpoints per slot
-    int32_t checkpoint_every_nt = 8192;  // make a checkpoint every n tokens during prefill
+    int32_t checkpoint_min_step = 256;   // minimum spacing between context checkpoints
    int32_t cache_ram_mib       = 8192;  // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

    std::string hostname      = "127.0.0.1";
@@ -604,11 +616,17 @@ struct common_params {

    std::map<std::string, std::string> default_template_kwargs;

-    // webui configs
-    bool webui = true;
+    // UI configs
+    bool ui = true;
+
+    // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
+    bool webui = ui;
    bool webui_mcp_proxy = false;
    std::string webui_config_json;

+    bool ui_mcp_proxy = false;
+    std::string ui_config_json;
+
    // "advanced" endpoints are disabled by default for better security
    bool endpoint_slots   = true;
    bool endpoint_props   = false; // only control POST requests, not GET
@@ -686,6 +704,7 @@ struct common_params {
 // initializes the logging system and prints info about the build
 void common_init();

+void common_params_print_info(const common_params & params, bool print_devices = true);
 std::string common_params_get_system_info(const common_params & params);

 bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
@@ -712,6 +731,7 @@ std::string string_format(const char * fmt, ...);

 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
+std::string string_lcs(std::string_view a, std::string_view b);

 std::string string_join(const std::vector<std::string> & values, const std::string & separator);
 std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
@@ -836,7 +856,7 @@ struct common_sampler;

 // note: defines the model, context, samplers, ets. lifetimes
 struct common_init_result {
-    common_init_result(common_params & params);
+    common_init_result(common_params & params, bool model_only = false);
    ~common_init_result();

    llama_model * model();
@@ -854,7 +874,7 @@ private:

 using common_init_result_ptr = std::unique_ptr<common_init_result>;

-common_init_result_ptr common_init_from_params(common_params & params);
+common_init_result_ptr common_init_from_params(common_params & params, bool model_only = false);

 struct llama_model_params     common_model_params_to_llama  (      common_params & params);
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
@@ -871,15 +891,20 @@ std::string common_get_model_endpoint();
 //

 enum common_context_seq_rm_type {
-    COMMON_CONTEXT_SEQ_RM_TYPE_NO   = 0, // seq_rm not supported (e.g. no memory module)
-    COMMON_CONTEXT_SEQ_RM_TYPE_PART = 1, // can seq_rm partial sequences
-    COMMON_CONTEXT_SEQ_RM_TYPE_FULL = 2, // can seq_rm full sequences only
+    COMMON_CONTEXT_SEQ_RM_TYPE_NO           = 0, // seq_rm not supported (e.g. no memory module)
+    COMMON_CONTEXT_SEQ_RM_TYPE_PART         = 1, // can seq_rm partial sequences
+    COMMON_CONTEXT_SEQ_RM_TYPE_FULL         = 2, // can seq_rm full sequences only
+    COMMON_CONTEXT_SEQ_RM_TYPE_RS = 3, // can seq_rm partial sequences, bounded by n_rs_seq
 };

 // check if the llama_context can remove sequences
 // note: clears the memory of the context
 common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx);

+// aborts execution on failure
+void common_context_seq_rm (llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1);
+void common_context_seq_add(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta);
+void common_context_seq_cp (llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1);

 //
 // Batch utils
@@ -1061,4 +1086,7 @@ struct common_prompt_checkpoint {
            llama_context * ctx,
            llama_seq_id seq_id,
            llama_state_seq_flags flags) const;
+
+    void clear_tgt();
+    void clear_dft();
 };
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -320,9 +320,9 @@ static int common_download_file_single_online(const std::string & url,

    auto head = cli.Head(parts.path);
    if (!head || head->status < 200 || head->status >= 300) {
-        LOG_WRN("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
+        LOG_TRC("%s: HEAD failed, status: %d\n", __func__, head ? head->status : -1);
        if (file_exists) {
-            LOG_INF("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
+            LOG_TRC("%s: using cached file (HEAD failed): %s\n", __func__, path.c_str());
            return 304; // 304 Not Modified - fake cached response
        }
        return head ? head->status : -1;
@@ -566,8 +566,11 @@ static hf_cache::hf_files get_split_files(const hf_cache::hf_files & files,
    return result;
 }

-static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
-                                          const std::string        & model) {
+// pick the best sibling GGUF whose filename contains `keyword` (e.g. "mmproj" / "mtp"),
+// preferring deeper shared directory prefix with the model, then closest quantization
+static hf_cache::hf_file find_best_sibling(const hf_cache::hf_files & files,
+                                           const std::string        & model,
+                                           const std::string        & keyword) {
    hf_cache::hf_file best;
    size_t best_depth = 0;
    int best_diff = 0;
@@ -579,20 +582,20 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,

    for (const auto & f : files) {
        if (!string_ends_with(f.path, ".gguf") ||
-            f.path.find("mmproj") == std::string::npos) {
+            f.path.find(keyword) == std::string::npos) {
            continue;
        }

-        auto mmproj_parts = string_split<std::string>(f.path, '/');
-        auto mmproj_dir = mmproj_parts.end() - 1;
+        auto sib_parts = string_split<std::string>(f.path, '/');
+        auto sib_dir = sib_parts.end() - 1;

        auto [_, dir] = std::mismatch(model_parts.begin(), model_dir,
-                                      mmproj_parts.begin(), mmproj_dir);
-        if (dir != mmproj_dir) {
+                                      sib_parts.begin(), sib_dir);
+        if (dir != sib_dir) {
            continue;
        }

-        size_t depth = dir - mmproj_parts.begin();
+        size_t depth = dir - sib_parts.begin();
        auto bits = extract_quant_bits(f.path);
        auto diff = std::abs(bits - model_bits);

@@ -606,6 +609,16 @@ static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
    return best;
 }

+static hf_cache::hf_file find_best_mmproj(const hf_cache::hf_files & files,
+                                          const std::string        & model) {
+    return find_best_sibling(files, model, "mmproj");
+}
+
+static hf_cache::hf_file find_best_mtp(const hf_cache::hf_files & files,
+                                       const std::string        & model) {
+    return find_best_sibling(files, model, "mtp-");
+}
+
 static bool gguf_filename_is_model(const std::string & filepath) {
    if (!string_ends_with(filepath, ".gguf")) {
        return false;
@@ -617,7 +630,8 @@ static bool gguf_filename_is_model(const std::string & filepath) {
    }

    return filename.find("mmproj")  == std::string::npos &&
-           filename.find("imatrix") == std::string::npos;
+           filename.find("imatrix") == std::string::npos &&
+           filename.find("mtp-")    == std::string::npos;
 }

 static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
@@ -673,11 +687,13 @@ struct hf_plan {
    hf_cache::hf_file primary;
    hf_cache::hf_files model_files;
    hf_cache::hf_file mmproj;
+    hf_cache::hf_file mtp;
 };

 static hf_plan get_hf_plan(const common_params_model  & model,
                           const common_download_opts & opts,
-                           bool download_mmproj) {
+                           bool download_mmproj,
+                           bool download_mtp) {
    hf_plan plan;
    hf_cache::hf_files all;

@@ -723,6 +739,10 @@ static hf_plan get_hf_plan(const common_params_model  & model,
        plan.mmproj = find_best_mmproj(all, primary.path);
    }

+    if (download_mtp) {
+        plan.mtp = find_best_mtp(all, primary.path);
+    }
+
    return plan;
 }

@@ -756,7 +776,8 @@ static std::vector<download_task> get_url_tasks(const common_params_model & mode

 common_download_model_result common_download_model(const common_params_model  & model,
                                                   const common_download_opts & opts,
-                                                   bool download_mmproj) {
+                                                   bool download_mmproj,
+                                                   bool download_mtp) {
    common_download_model_result result;
    std::vector<download_task> tasks;
    hf_plan hf;
@@ -764,13 +785,16 @@ common_download_model_result common_download_model(const common_params_model  &
    bool is_hf = !model.hf_repo.empty();

    if (is_hf) {
-        hf = get_hf_plan(model, opts, download_mmproj);
+        hf = get_hf_plan(model, opts, download_mmproj, download_mtp);
        for (const auto & f : hf.model_files) {
            tasks.push_back({f.url, f.local_path});
        }
        if (!hf.mmproj.path.empty()) {
            tasks.push_back({hf.mmproj.url, hf.mmproj.local_path});
        }
+        if (!hf.mtp.path.empty()) {
+            tasks.push_back({hf.mtp.url, hf.mtp.local_path});
+        }
    } else if (!model.url.empty()) {
        tasks = get_url_tasks(model);
    } else {
@@ -807,6 +831,10 @@ common_download_model_result common_download_model(const common_params_model  &
        if (!hf.mmproj.path.empty()) {
            result.mmproj_path = hf_cache::finalize_file(hf.mmproj);
        }
+
+        if (!hf.mtp.path.empty()) {
+            result.mtp_path = hf_cache::finalize_file(hf.mtp);
+        }
    } else {
        result.model_path = model.path;
    }
@@ -946,7 +974,8 @@ std::vector<common_cached_model_info> common_list_cached_models() {
    for (const auto & f : files) {
        auto split = get_gguf_split_info(f.path);
        if (split.index != 1 || split.tag.empty() ||
-            split.prefix.find("mmproj") != std::string::npos) {
+            split.prefix.find("mmproj") != std::string::npos ||
+            split.prefix.find("mtp-")   != std::string::npos) {
            continue;
        }
        if (seen.insert(f.repo_id + ":" + split.tag).second) {
--- a/common/download.h
+++ b/common/download.h
@@ -59,6 +59,7 @@ struct common_download_opts {
 struct common_download_model_result {
    std::string model_path;
    std::string mmproj_path;
+    std::string mtp_path;
 };

 // Download model from HuggingFace repo or URL
@@ -83,12 +84,14 @@ struct common_download_model_result {
 // when opts.offline=true, no network requests are made
 // when download_mmproj=true, searches for mmproj in same directory as model or any parent directory
 // then with the closest quantization bits
+// when download_mtp=true, applies the same sibling search for an MTP-head GGUF
 //
-// returns result with model_path and mmproj_path (empty on failure)
+// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure)
 common_download_model_result common_download_model(
    const common_params_model & model,
    const common_download_opts & opts = {},
-    bool download_mmproj = false
+    bool download_mmproj = false,
+    bool download_mtp    = false
 );

 // returns list of cached models
--- a/common/fit.cpp
+++ b/common/fit.cpp
@@ -26,7 +26,7 @@ class common_params_fit_exception : public std::runtime_error {
    using std::runtime_error::runtime_error;
 };

-static std::vector<llama_device_memory_data> common_get_device_memory_data(
+std::vector<llama_device_memory_data> common_get_device_memory_data(
        const char * path_model,
        const llama_model_params * mparams,
        const llama_context_params * cparams,
@@ -168,7 +168,7 @@ static void common_params_fit_impl(

    // step 1: get data for default parameters and check whether any changes are necessary in the first place

-    LOG_INF("%s: getting device memory data for initial parameters:\n", __func__);
+    LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__);
    const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
    const size_t nd = devs.size(); // number of devices

@@ -213,13 +213,13 @@ static void common_params_fit_impl(
        LOG_INF("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
            __func__, sum_projected_used/MiB, sum_free/MiB);
        if (sum_projected_free >= margins[0]) {
-            LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
+            LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
                __func__, sum_projected_free/MiB, margins[0]/MiB);
            return;
        }
    } else {
        if (nd > 1) {
-            LOG_INF("%s: projected memory use with initial parameters [MiB]:\n", __func__);
+            LOG_TRC("%s: projected memory use with initial parameters [MiB]:\n", __func__);
        }
        for (size_t id = 0; id < nd; id++) {
            const llama_device_memory_data & dmd = dmds_full[id];
@@ -234,16 +234,16 @@ static void common_params_fit_impl(
            sum_projected_model += dmd.mb.model;

            if (nd > 1) {
-                LOG_INF("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+                LOG_TRC("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
                    __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
            }
        }
        assert(sum_free >= 0 && sum_projected_used >= 0);
-        LOG_INF("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
+        LOG_TRC("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
            __func__, sum_projected_used/MiB, sum_free/MiB);
        if (nd == 1) {
            if (projected_free_per_device[0] >= margins[0]) {
-                LOG_INF("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
+                LOG_TRC("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
                    __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
                return;
            }
@@ -256,7 +256,7 @@ static void common_params_fit_impl(
                }
            }
            if (!changes_needed) {
-                LOG_INF("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
+                LOG_TRC("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
                return;
            }
        }
@@ -275,10 +275,10 @@ static void common_params_fit_impl(
        }
        if (global_surplus < 0) {
            if (nd <= 1) {
-                LOG_INF("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
+                LOG_TRC("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
                    __func__, margins[0]/MiB, -global_surplus/MiB);
            } else {
-                LOG_INF(
+                LOG_TRC(
                    "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
                    __func__, -global_surplus/MiB);
            }
@@ -320,28 +320,28 @@ static void common_params_fit_impl(

                        const int64_t bytes_per_ctx = (sum_projected_used - sum_projected_used_min_ctx) / (hp_nct - n_ctx_min);
                        const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
-                        LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                        LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                        if (nd <= 1) {
-                            LOG_INF("%s: entire model can be fit by reducing context\n", __func__);
+                            LOG_TRC("%s: entire model can be fit by reducing context\n", __func__);
                            return;
                        }
-                        LOG_INF("%s: entire model should be fit across devices by reducing context\n", __func__);
+                        LOG_TRC("%s: entire model should be fit across devices by reducing context\n", __func__);
                    } else {
                        const int64_t memory_reduction = sum_projected_used - sum_projected_used_min_ctx;
-                        LOG_INF("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+                        LOG_TRC("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                            __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
                    }
                } else {
                    if (n_ctx_min == UINT32_MAX) {
-                        LOG_INF("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
+                        LOG_TRC("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
                    } else {
-                        LOG_INF("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
+                        LOG_TRC("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
                            __func__, hp_nct, n_ctx_min);
                    }
                }
            } else {
-                LOG_INF("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
+                LOG_TRC("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
            }
        }
    }
@@ -485,10 +485,10 @@ static void common_params_fit_impl(
        const dmds_t dmd_nl = common_get_device_memory_data(
            path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

-        LOG_INF("%s: memory for test allocation by device:\n", func_name);
+        LOG_TRC("%s: memory for test allocation by device:\n", func_name);
        for (size_t id = 0; id < nd; id++) {
            const ngl_t & n = ngl_per_device[id];
-            LOG_INF(
+            LOG_TRC(
                "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
                func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
        }
@@ -509,7 +509,7 @@ static void common_params_fit_impl(
        tensor_buft_overrides[1] = {nullptr, nullptr};
        mparams->tensor_buft_overrides = tensor_buft_overrides;

-        LOG_INF("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
+        LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
        const dmds_t dmds_cpu_moe = common_get_device_memory_data(
            path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);

@@ -519,10 +519,10 @@ static void common_params_fit_impl(
        }

        if (global_surplus_cpu_moe > 0) {
-            LOG_INF("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
+            LOG_TRC("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
                __func__, global_surplus_cpu_moe/MiB);
        } else {
-            LOG_INF("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
+            LOG_TRC("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
                __func__, -global_surplus_cpu_moe/MiB);
        }

@@ -535,7 +535,7 @@ static void common_params_fit_impl(
    targets.reserve(nd);
    for (size_t id = 0; id < nd; id++) {
        targets.push_back(dmds_full[id].free - margins[id]);
-        LOG_INF("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
+        LOG_TRC("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
    }

    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
@@ -555,9 +555,9 @@ static void common_params_fit_impl(
    //   - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
    //   - the last device has the output layer, which cannot be a partial layer
    if (hp_nex == 0) {
-        LOG_INF("%s: filling dense layers back-to-front:\n", __func__);
+        LOG_TRC("%s: filling dense layers back-to-front:\n", __func__);
    } else {
-        LOG_INF("%s: filling dense-only layers back-to-front:\n", __func__);
+        LOG_TRC("%s: filling dense-only layers back-to-front:\n", __func__);
    }
    for (int id = nd - 1; id >= 0; id--) {
        uint32_t n_unassigned = hp_ngl + 1;
@@ -576,7 +576,7 @@ static void common_params_fit_impl(
            if (mem_high[id] > targets[id]) {
                assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
                uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-                LOG_INF("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
+                LOG_TRC("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
                while (delta > 1) {
                    uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                    step_size = std::max(step_size, uint32_t(1));
@@ -593,11 +593,11 @@ static void common_params_fit_impl(
                    if (mem_test[id] <= targets[id]) {
                        ngl_per_device = ngl_per_device_test;
                        mem            = mem_test;
-                        LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+                        LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
                    } else {
                        ngl_per_device_high = ngl_per_device_test;
                        mem_high            = mem_test;
-                        LOG_INF("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
+                        LOG_TRC("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
                    }
                    delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
                }
@@ -605,12 +605,12 @@ static void common_params_fit_impl(
                assert(ngl_per_device_high[id].n_layer == n_unassigned);
                ngl_per_device = ngl_per_device_high;
                mem            = mem_high;
-                LOG_INF("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+                LOG_TRC("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
+        LOG_TRC(
            "%s:   - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
    }
@@ -634,7 +634,7 @@ static void common_params_fit_impl(
    }
    assert(id_dense_start < nd);

-    LOG_INF("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
+    LOG_TRC("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        for (size_t jd = id_dense_start; jd < nd; jd++) {
@@ -674,13 +674,13 @@ static void common_params_fit_impl(
                    ngl_per_device = ngl_per_device_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                } else {
                    ngl_per_device_high = ngl_per_device_test;
                    mem_high            = mem_test;
                    id_dense_start_high = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                }
                assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
@@ -690,7 +690,7 @@ static void common_params_fit_impl(
            ngl_per_device = ngl_per_device_high;
            mem            = mem_high;
            id_dense_start = id_dense_start_high;
-            LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
+            LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
                __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
        }

@@ -710,44 +710,44 @@ static void common_params_fit_impl(
            if (id < nd - 1) {
                overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
            }
-            LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
+            LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                ngl_per_device = ngl_per_device_test;
                overflow_bufts = overflow_bufts_test;
                mem            = mem_test;
                id_dense_start = id_dense_start_test;
-                LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
+                LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
                    __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);

                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
-                LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
+                LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                    ngl_per_device = ngl_per_device_test;
                    overflow_bufts = overflow_bufts_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            } else {
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
-                LOG_INF("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
+                LOG_TRC("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                    ngl_per_device = ngl_per_device_test;
                    overflow_bufts = overflow_bufts_test;
                    mem            = mem_test;
                    id_dense_start = id_dense_start_test;
-                    LOG_INF("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
+                    LOG_TRC("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
                        __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
                }
            }
        }

        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
+        LOG_TRC(
            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }
@@ -755,7 +755,7 @@ static void common_params_fit_impl(
    // print info for devices that were not changed during the conversion from dense only to full layers:
    for (size_t id = id_dense_start + 1; id < nd; id++) {
        const int64_t projected_margin = dmds_full[id].free - mem[id];
-        LOG_INF(
+        LOG_TRC(
            "%s:   - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }
@@ -776,7 +776,7 @@ enum common_params_fit_status common_fit_params(
    common_params_fit_status status = COMMON_PARAMS_FIT_STATUS_SUCCESS;
    try {
        common_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
-        LOG_INF("%s: successfully fit params to free device memory\n", __func__);
+        LOG_TRC("%s: successfully fit params to free device memory\n", __func__);
    } catch (const common_params_fit_exception & e) {
        LOG_WRN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
        status = COMMON_PARAMS_FIT_STATUS_FAILURE;
@@ -785,7 +785,7 @@ enum common_params_fit_status common_fit_params(
        status = COMMON_PARAMS_FIT_STATUS_ERROR;
    }
    const int64_t t1_us = llama_time_us();
-    LOG_INF("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
+    LOG_TRC("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
    return status;
 }

@@ -925,7 +925,7 @@ void common_memory_breakdown_print(const struct llama_context * ctx) {
        }
    }
    for (const auto & td : table_data) {
-        LOG_INF(td[0].c_str(),
+        LOG_TRC(td[0].c_str(),
            __func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
            td[6].c_str(), td[7].c_str(), td[8].c_str());
    }
--- a/common/fit.h
+++ b/common/fit.h
@@ -1,6 +1,11 @@
 #pragma once

 #include "ggml.h"
+#include "ggml-backend.h"
+#include "llama.h"
+#include "../src/llama-ext.h"
+
+#include <vector>

 enum common_params_fit_status {
    COMMON_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
@@ -30,3 +35,14 @@ void common_fit_print(
                struct llama_context_params * cparams);

 void common_memory_breakdown_print(const struct llama_context * ctx);
+
+// Load a model + context with no_alloc and return the per-device memory breakdown.
+std::vector<llama_device_memory_data> common_get_device_memory_data(
+                                  const char   * path_model,
+        const struct llama_model_params         * mparams,
+        const struct llama_context_params       * cparams,
+        std::vector<ggml_backend_dev_t>         & devs,
+                                      uint32_t  & hp_ngl,
+                                      uint32_t  & hp_n_ctx_train,
+                                      uint32_t  & hp_n_expert,
+                           enum ggml_log_level    log_level);
--- a/common/hf-cache.cpp
+++ b/common/hf-cache.cpp
@@ -11,7 +11,6 @@
 #include <filesystem>
 #include <fstream>
 #include <atomic>
-#include <regex> // migration only
 #include <string>
 #include <string_view>
 #include <stdexcept>
@@ -336,15 +335,9 @@ hf_files get_repo_files(const std::string & repo_id,
                if (item["lfs"].contains("oid") && item["lfs"]["oid"].is_string()) {
                    file.oid = item["lfs"]["oid"].get<std::string>();
                }
-                if (item["lfs"].contains("size") && item["lfs"]["size"].is_number()) {
-                    file.size = item["lfs"]["size"].get<size_t>();
-                }
            } else if (item.contains("oid") && item["oid"].is_string()) {
                file.oid = item["oid"].get<std::string>();
            }
-            if (file.size == 0 && item.contains("size") && item["size"].is_number()) {
-                file.size = item["size"].get<size_t>();
-            }

            if (!file.oid.empty() && !is_valid_oid(file.oid)) {
                LOG_WRN("%s: skip invalid oid: %s\n", __func__, file.oid.c_str());
@@ -502,271 +495,4 @@ std::string finalize_file(const hf_file & file) {
    return file.final_path;
 }

-// delete everything after this line, one day
-
-// copied from download.cpp without the tag part
-struct gguf_split_info {
-    std::string prefix; // tag included
-    int index;
-    int count;
-};
-
-static gguf_split_info get_gguf_split_info(const std::string & path) {
-    static const std::regex re_split("^(.+)-([0-9]{5})-of-([0-9]{5})$", std::regex::icase);
-    std::smatch m;
-
-    std::string prefix = path;
-    if (!string_remove_suffix(prefix, ".gguf")) {
-        return {};
-    }
-
-    int index = 1;
-    int count = 1;
-
-    if (std::regex_match(prefix, m, re_split)) {
-        index = std::stoi(m[2].str());
-        count = std::stoi(m[3].str());
-        prefix = m[1].str();
-    }
-
-    return {std::move(prefix), index, count};
-}
-
-static std::pair<std::string, std::string> parse_manifest_name(std::string & filename) {
-    static const std::regex re(R"(^manifest=([^=]+)=([^=]+)=.*\.json$)");
-    std::smatch match;
-    if (std::regex_match(filename, match, re)) {
-        return {match[1].str(), match[2].str()};
-    }
-    return {};
-}
-
-static std::string make_old_cache_filename(const std::string & owner,
-                                           const std::string & repo,
-                                           const std::string & filename) {
-    auto result = owner + "_" + repo + "_" + filename;
-    string_replace_all(result, "/", "_");
-    return result;
-}
-
-struct migrate_file {
-    std::string path;
-    std::string sha256;
-    size_t size;
-    fs::path old_path;
-    fs::path etag_path;
-    const hf_file * file;
-};
-
-using migrate_files = std::vector<migrate_file>;
-
-static bool collect_file(const fs::path    & old_cache,
-                         const std::string & owner,
-                         const std::string & repo,
-                         const std::string & path,
-                         const std::string & sha256,
-                         const hf_files    & files,
-                         migrate_files     & to_migrate) {
-
-    const hf_file * file = nullptr;
-
-    for (const auto & f : files) {
-        if (f.path == path) {
-            file = &f;
-            break;
-        }
-    }
-
-    std::string old_filename = make_old_cache_filename(owner, repo, path);
-    fs::path old_path = old_cache / old_filename;
-    fs::path etag_path = old_path.string() + ".etag";
-
-    if (!fs::exists(old_path)) {
-        if (file && fs::exists(file->final_path)) {
-            return true;
-        }
-        LOG_WRN("%s: %s not found in old cache or HF cache\n", __func__, old_filename.c_str());
-        return false;
-    }
-
-    if (!file) {
-        LOG_WRN("%s: %s not found in current repo\n", __func__, old_filename.c_str());
-        return false;
-    }
-
-    if (!sha256.empty() && !file->oid.empty() && sha256 != file->oid) {
-        LOG_WRN("%s: %s is not up to date (sha256 mismatch)\n", __func__, old_filename.c_str());
-        return false;
-    }
-
-    if (file->size > 0) {
-        size_t size = fs::file_size(old_path);
-        if (size != file->size) {
-            LOG_WRN("%s: %s has wrong size %zu (expected %zu)\n", __func__, old_filename.c_str(), size, file->size);
-            return false;
-        }
-    }
-
-    to_migrate.push_back({path, sha256, file->size, old_path, etag_path, file});
-    return true;
-}
-
-static bool collect_files(const fs::path    & old_cache,
-                          const std::string & owner,
-                          const std::string & repo,
-                          const nl::json    & node,
-                          const hf_files    & files,
-                          migrate_files     & to_migrate) {
-
-    if (!node.contains("rfilename") ||
-        !node.contains("lfs")       ||
-        !node["lfs"].contains("sha256")) {
-        return true;
-    }
-
-    std::string path = node["rfilename"];
-    std::string sha256 = node["lfs"]["sha256"];
-
-    auto split = get_gguf_split_info(path);
-
-    if (split.count <= 1) {
-        return collect_file(old_cache, owner, repo, path, sha256, files, to_migrate);
-    }
-
-    std::vector<std::pair<std::string, std::string>> splits;
-
-    for (const auto & f : files) {
-        auto split_f = get_gguf_split_info(f.path);
-        if (split_f.count == split.count && split_f.prefix == split.prefix) {
-            // sadly the manifest only provides the sha256 of the first file (index == 1)
-            // the rest will be verified using the size...
-            std::string f_sha256 = (split_f.index == 1) ? sha256 : "";
-            splits.emplace_back(f.path, f_sha256);
-        }
-    }
-
-    if ((int)splits.size() != split.count) {
-        LOG_WRN("%s: expected %d split files but found %d in repo\n", __func__, split.count, (int)splits.size());
-        return false;
-    }
-
-    for (const auto & [f_path, f_sha256] : splits) {
-        if (!collect_file(old_cache, owner, repo, f_path, f_sha256, files, to_migrate)) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-static bool migrate_file(const migrate_file & file) {
-    std::error_code ec;
-
-    fs::path new_path(file.file->local_path);
-    fs::create_directories(new_path.parent_path(), ec);
-
-    if (!fs::exists(new_path, ec)) {
-        fs::rename(file.old_path, new_path, ec);
-        if (ec) {
-            fs::copy_file(file.old_path, new_path, ec);
-            if (ec) {
-                LOG_ERR("%s: failed to move/copy %s: %s\n", __func__, file.old_path.string().c_str(), ec.message().c_str());
-                return false;
-            }
-        }
-        fs::remove(file.old_path, ec);
-    }
-    fs::remove(file.etag_path, ec);
-
-    std::string filename = finalize_file(*file.file);
-    LOG_INF("%s: migrated %s -> %s\n", __func__, file.old_path.filename().string().c_str(), filename.c_str());
-    return true;
-}
-
-void migrate_old_cache_to_hf_cache(const std::string & token, bool offline) {
-    fs::path old_cache = fs_get_cache_directory();
-    if (!fs::exists(old_cache)) {
-        return;
-    }
-
-    if (offline) {
-        LOG_WRN("%s: skipping migration in offline mode (will run when online)\n", __func__);
-        return; // -hf is not going to work
-    }
-
-    bool warned = false;
-
-    for (const auto & entry : fs::directory_iterator(old_cache)) {
-        if (!entry.is_regular_file()) {
-            continue;
-        }
-        auto filename = entry.path().filename().string();
-        auto [owner, repo] = parse_manifest_name(filename);
-
-        if (owner.empty() || repo.empty()) {
-            continue;
-        }
-
-        if (!warned) {
-            warned = true;
-            LOG_WRN("================================================================================\n"
-                    "WARNING: Migrating cache to HuggingFace cache directory\n"
-                    "  Old cache: %s\n"
-                    "  New cache: %s\n"
-                    "This one-time migration moves models previously downloaded with -hf\n"
-                    "from the legacy llama.cpp cache to the standard HuggingFace cache.\n"
-                    "Models downloaded with --model-url are not affected.\n"
-                    "================================================================================\n",
-                    old_cache.string().c_str(), get_cache_directory().string().c_str());
-        }
-
-        auto repo_id = owner + "/" + repo;
-        auto files = get_repo_files(repo_id, token);
-
-        if (files.empty()) {
-            LOG_WRN("%s: could not get repo files for %s, skipping\n", __func__, repo_id.c_str());
-            continue;
-        }
-
-        migrate_files to_migrate;
-        bool ok = true;
-
-        try {
-            std::ifstream manifest(entry.path());
-            auto json = nl::json::parse(manifest);
-            for (const char * key : {"ggufFile", "mmprojFile"}) {
-                if (json.contains(key)) {
-                    if (!collect_files(old_cache, owner, repo, json[key], files, to_migrate)) {
-                        ok = false;
-                        break;
-                    }
-                }
-            }
-        } catch (const std::exception & e) {
-            LOG_WRN("%s: failed to parse manifest %s: %s\n", __func__, filename.c_str(), e.what());
-            continue;
-        }
-
-        if (!ok) {
-            LOG_WRN("%s: migration skipped: one or more files failed validation\n", __func__);
-            continue;
-        }
-
-        for (const auto & file : to_migrate) {
-            if (!migrate_file(file)) {
-                ok = false;
-                break;
-            }
-        }
-
-        if (!ok) {
-            LOG_WRN("%s: migration failed: could not migrate all files\n", __func__);
-            continue;
-        }
-
-        LOG_INF("%s: migration complete, deleting manifest: %s\n", __func__, entry.path().string().c_str());
-        fs::remove(entry.path());
-    }
-}
-
 } // namespace hf_cache
--- a/common/hf-cache.h
+++ b/common/hf-cache.h
@@ -14,7 +14,6 @@ struct hf_file {
    std::string final_path;
    std::string oid;
    std::string repo_id;
-    size_t size = 0; // only for the migration
 };

 using hf_files = std::vector<hf_file>;
@@ -30,7 +29,4 @@ hf_files get_cached_files(const std::string & repo_id = {});
 // Create snapshot path (link or move/copy) and return it
 std::string finalize_file(const hf_file & file);

-// TODO: Remove later
-void migrate_old_cache_to_hf_cache(const std::string & token, bool offline = false);
-
 } // namespace hf_cache
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -435,10 +435,10 @@ void common_log_flush(struct common_log * log) {
 static int common_get_verbosity(enum ggml_log_level level) {
    switch (level) {
        case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;
-        case GGML_LOG_LEVEL_INFO:  return LOG_LEVEL_INFO;
+        case GGML_LOG_LEVEL_INFO:  return LOG_LEVEL_TRACE;
        case GGML_LOG_LEVEL_WARN:  return LOG_LEVEL_WARN;
        case GGML_LOG_LEVEL_ERROR: return LOG_LEVEL_ERROR;
-        case GGML_LOG_LEVEL_CONT:  return LOG_LEVEL_INFO; // same as INFO
+        case GGML_LOG_LEVEL_CONT:  return LOG_LEVEL_TRACE;
        case GGML_LOG_LEVEL_NONE:
        default:
            return LOG_LEVEL_OUTPUT;
--- a/common/log.h
+++ b/common/log.h
@@ -21,7 +21,8 @@
 #    define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif

-#define LOG_LEVEL_DEBUG  4
+#define LOG_LEVEL_DEBUG  5
+#define LOG_LEVEL_TRACE  4
 #define LOG_LEVEL_INFO   3
 #define LOG_LEVEL_WARN   2
 #define LOG_LEVEL_ERROR  1
@@ -111,13 +112,15 @@ void common_log_flush         (struct common_log * log);                    // f
 #define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity,        __VA_ARGS__)

 #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_LEVEL_DEBUG,  __VA_ARGS__)
+#define LOG_TRC(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  LOG_LEVEL_TRACE,  __VA_ARGS__)
 #define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  LOG_LEVEL_INFO,   __VA_ARGS__)
 #define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  LOG_LEVEL_WARN,   __VA_ARGS__)
 #define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, LOG_LEVEL_ERROR,  __VA_ARGS__)
 #define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  LOG_LEVEL_INFO,   __VA_ARGS__) // same as INFO

+#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
+#define LOG_TRCV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_TRACE, verbosity, __VA_ARGS__)
 #define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO,  verbosity, __VA_ARGS__)
 #define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN,  verbosity, __VA_ARGS__)
 #define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__)
-#define LOG_DBGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__)
 #define LOG_CNTV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  verbosity, __VA_ARGS__)
--- a/common/ngram-map.cpp
+++ b/common/ngram-map.cpp
@@ -471,7 +471,7 @@ void common_ngram_map_draft(common_ngram_map & map,
        sum_occur += curr_occur;
    }

-    LOG_INF("%s: key_offset = %zu, max_occur = %d, sum_occur = %d, slot_max = %d [%zu/%d, %zu/%d, %zu/%d, %zu/%d]\n", __func__,
+    LOG_DBG("%s: key_offset = %zu, max_occur = %d, sum_occur = %d, slot_max = %d [%zu/%d, %zu/%d, %zu/%d, %zu/%d]\n", __func__,
            key_offset,
            max_occur, sum_occur, slot_max,
            curr_key.values[0].value_idx, curr_key.values[0].value_num,
@@ -482,7 +482,7 @@ void common_ngram_map_draft(common_ngram_map & map,
    // Print the tokens of the four values (if idx != 0), use LOG_INF
    for (int v = 0; v < COMMON_NGRAM_MAX_VALUES; ++v) {
        if (curr_key.values[v].value_idx != 0) {
-            LOG_INF("%s: value[%d] = %s\n", __func__, v, common_tokens_to_str(inp, curr_key.values[v].value_idx, m).c_str());
+            LOG_DBG("%s: value[%d] = %s\n", __func__, v, common_tokens_to_str(inp, curr_key.values[v].value_idx, m).c_str());
        }
    }

@@ -500,7 +500,7 @@ void common_ngram_map_draft(common_ngram_map & map,
        draft.push_back(inp[match_pos + n + i]);
    }

-    LOG_INF("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__,
+    LOG_DBG("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__,
            key_offset, slot_max,
            curr_key.key_num, draft.size());

--- a/common/reasoning-budget.cpp
+++ b/common/reasoning-budget.cpp
@@ -171,22 +171,12 @@ static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
    ctx->force_pos = 0;
 }

-// forward declaration for use in clone
 static struct llama_sampler * common_reasoning_budget_init_state(
        const struct llama_vocab * vocab, const std::vector<llama_token> & start_tokens,
        const std::vector<llama_token> & end_tokens, const std::vector<llama_token> & forced_tokens,
        int32_t budget, common_reasoning_budget_state initial_state);

-static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) {
-    const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx;
-    return common_reasoning_budget_init_state(
-        ctx->vocab,
-        ctx->start_matcher.tokens,
-        ctx->end_matcher.tokens,
-        ctx->forced_tokens,
-        ctx->budget,
-        ctx->state);
-}
+static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl);

 static void common_reasoning_budget_free(struct llama_sampler * smpl) {
    delete (common_reasoning_budget_ctx *) smpl->ctx;
@@ -205,6 +195,15 @@ static struct llama_sampler_i common_reasoning_budget_i = {
    /* .backend_set_input = */ nullptr,
 };

+static struct llama_sampler * common_reasoning_budget_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const common_reasoning_budget_ctx *) smpl->ctx;
+
+    return llama_sampler_init(
+        /* .iface = */ &common_reasoning_budget_i,
+        /* .ctx   = */ new common_reasoning_budget_ctx(*ctx)
+    );
+}
+
 static struct llama_sampler * common_reasoning_budget_init_state(
        const struct llama_vocab             * vocab,
        const std::vector<llama_token>       & start_tokens,
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -3,6 +3,7 @@
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
+#include "../src/llama-ext.h" // staging API: llama_set_embeddings_pre_norm / llama_get_embeddings_pre_norm_ith (used by MTP)
 #include "log.h"
 #include "ngram-cache.h"
 #include "ngram-map.h"
@@ -23,6 +24,7 @@ const std::map<std::string, common_speculative_type> common_speculative_type_fro
    {"none",          COMMON_SPECULATIVE_TYPE_NONE},
    {"draft-simple",  COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE},
    {"draft-eagle3",  COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3},
+    {"draft-mtp",     COMMON_SPECULATIVE_TYPE_DRAFT_MTP},
    {"ngram-simple",  COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE},
    {"ngram-map-k",   COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K},
    {"ngram-map-k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V},
@@ -30,6 +32,18 @@ const std::map<std::string, common_speculative_type> common_speculative_type_fro
    {"ngram-cache",   COMMON_SPECULATIVE_TYPE_NGRAM_CACHE}
 };

+static std::string common_speculative_get_devices_str(const std::vector<ggml_backend_dev_t> & devices) {
+    std::string result;
+    for (size_t i = 0; i < devices.size(); i++) {
+        if (devices[i] == nullptr) {
+            continue;
+        }
+        if (!result.empty()) result += ", ";
+        result += ggml_backend_dev_name(devices[i]);
+    }
+    return result.empty() ? "default" : result;
+}
+
 struct common_speculative_config {
    common_speculative_type type;
    common_params_speculative params;
@@ -142,7 +156,13 @@ struct common_speculative_impl {

    virtual void draft(common_speculative_draft_params_vec & dparams) = 0;

-    virtual void accept(llama_seq_id seq_id, uint16_t n_accepted) = 0;
+    virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0;
+
+    // true if this implementation requires the target context to extract post-norm embeddings
+    virtual bool need_embd() const = 0;
+
+    // true if this implementation requires the target context to extract pre-norm embeddings
+    virtual bool need_embd_pre_norm() const { return false; }
 };

 struct common_speculative_impl_draft_simple : public common_speculative_impl {
@@ -159,6 +179,16 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
        auto * ctx_dft = this->params.ctx_dft;
        auto * ctx_tgt = this->params.ctx_tgt;

+        LOG_INF("%s: adding speculative implementation 'draft-simple'\n", __func__);
+        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min);
+        LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__,
+                this->params.n_gpu_layers,
+                ggml_type_name(this->params.cache_type_k),
+                ggml_type_name(this->params.cache_type_v),
+                ctx_tgt ? "yes" : "no",
+                ctx_dft ? "yes" : "no",
+                common_speculative_get_devices_str(this->params.devices).c_str());
+
        batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);

        // TODO: optimize or pass from outside?
@@ -335,16 +365,24 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
        }
    }

-    void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override {
+    void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
        // noop
    }
+
+    bool need_embd() const override {
+        return false;
+    }
 };

 struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
    //common_params_speculative_eagle3 params;

-    common_speculative_impl_draft_eagle3(const common_params_speculative & /*params*/, uint32_t n_seq)
-        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) {}
+    common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq)
+        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq)
+    {
+        LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__);
+        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min);
+    }

    void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override {
        // noop
@@ -359,9 +397,384 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
        // TODO: implement
    }

-    void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override {
+    void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
        // noop
    }
+
+    bool need_embd() const override {
+        return false;
+    }
+};
+
+struct common_speculative_impl_draft_mtp : public common_speculative_impl {
+    common_params_speculative_draft params; // reuses the draft-model params slot (ctx_tgt/ctx_dft)
+
+    llama_batch batch;
+
+    std::vector<common_sampler_ptr> smpls;
+
+    // backend sampler chain per seq, attached to ctx_dft
+    std::vector<llama_sampler *> backend_chains;
+
+    int32_t n_embd = 0;
+
+    // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1.
+    // The last h-row of one process() call needs the first token of the NEXT
+    // call to pair with, so it's stashed here until that next call fires.
+    std::vector<std::vector<float>> pending_h;   // [n_seq][n_embd]
+
+    std::vector<int32_t> i_batch_beg;
+    std::vector<int32_t> i_batch_end;
+
+    // Hidden rows from the most recent target verification batch, grouped by seq.
+    // Row 0 corresponds to the sampled token, row N to the Nth accepted draft token.
+    std::vector<std::vector<float>> verify_h;
+    std::vector<int32_t> verify_h_rows;
+
+    // Per-seq draft length from the last draft() call, used in accept() to
+    // roll back ctx_dft's recurrent state past the AR draft's redundant
+    // pre-advancement before process() mirrored the verify batch.
+    std::vector<uint16_t> last_n_drafted;
+
+    common_speculative_impl_draft_mtp(const common_params_speculative & params, uint32_t n_seq)
+        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, n_seq)
+        , params(params.draft)
+    {
+        auto * ctx_tgt = this->params.ctx_tgt;
+        auto * ctx_dft = this->params.ctx_dft;
+        GGML_ASSERT(ctx_tgt && ctx_dft && "MTP requires ctx_tgt and ctx_dft to be set");
+
+        n_embd = llama_model_n_embd(llama_get_model(ctx_dft));
+
+        LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__);
+        LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling);
+        LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__,
+                this->params.n_gpu_layers,
+                ggml_type_name(this->params.cache_type_k),
+                ggml_type_name(this->params.cache_type_v),
+                ctx_tgt ? "yes" : "no",
+                ctx_dft ? "yes" : "no",
+                common_speculative_get_devices_str(this->params.devices).c_str());
+
+        const int32_t n_b = (int32_t) llama_n_batch(ctx_dft);
+        batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd, /*n_seq_max=*/ 1);
+        // llama_batch_init allocates only one of token/embd; MTP needs both.
+        // TODO: fix, how to call without malloc
+        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_b);
+
+        smpls.resize(n_seq);
+        for (auto & s : smpls) {
+            common_params_sampling sparams;
+            sparams.no_perf  = false;
+            sparams.top_k    = 10;
+            sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K };
+            s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
+        }
+
+        // offload draft sampling to the backend
+        backend_chains.assign(n_seq, nullptr);
+        if (this->params.backend_sampling) {
+            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());
+                llama_sampler_chain_add(chain, llama_sampler_init_top_k(10));
+
+                if (!llama_set_sampler(ctx_dft, seq_id, chain)) {
+                    LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id);
+                    llama_sampler_free(chain);
+                    chain = nullptr;
+                }
+                backend_chains[seq_id] = chain;
+            }
+        }
+
+        llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false);
+        llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true);
+
+        pending_h.assign(n_seq, std::vector<float>(n_embd, 0.0f));
+
+        i_batch_beg.assign(n_seq, -1);
+        i_batch_end.assign(n_seq, -1);
+
+        verify_h.assign(n_seq, {});
+        verify_h_rows.assign(n_seq, 0);
+
+        last_n_drafted.assign(n_seq, 0);
+    }
+
+    ~common_speculative_impl_draft_mtp() override {
+        auto * ctx_dft = this->params.ctx_dft;
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) {
+            if (backend_chains[seq_id] == nullptr) {
+                continue;
+            }
+            if (ctx_dft) {
+                llama_set_sampler(ctx_dft, seq_id, nullptr);
+            }
+            llama_sampler_free(backend_chains[seq_id]);
+        }
+        backend_chains.clear();
+
+        if (batch.token != nullptr) {
+            free(batch.token);
+            batch.token = nullptr;
+        }
+        llama_batch_free(batch);
+    }
+
+    void begin(llama_seq_id seq_id, const llama_tokens & prompt) override {
+        const int32_t N = (int32_t) prompt.size();
+        if (N <= 0) {
+            return;
+        }
+        auto * ctx_dft = this->params.ctx_dft;
+        const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
+        if (pos_max < N - 1) {
+            LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - "
+                    "process() hook may not have run on every prefill ubatch "
+                    "(need_embd / logits=1 on every prompt position?). "
+                    "Drafts may degrade.\n",
+                    __func__, (int) pos_max, N - 1);
+        }
+    }
+
+    bool process(const llama_batch & batch_in) override {
+        if (batch_in.n_tokens <= 0) {
+            return true;
+        }
+
+        // TODO: how to make it work with vision tokens?
+        if (batch_in.token == nullptr || batch_in.embd != nullptr) {
+            return true;
+        }
+
+        const int32_t n_tokens = batch_in.n_tokens;
+
+        // remember the frist and last batch index for each sequence
+        std::fill(i_batch_beg.begin(), i_batch_beg.end(), -1);
+        std::fill(i_batch_end.begin(), i_batch_end.end(), -1);
+
+        for (int k = 0; k < n_tokens; ++k) {
+            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                GGML_ASSERT(batch_in.n_seq_id[k] == 1);
+
+                if (batch_in.seq_id[k][0] == seq_id) {
+                    i_batch_end[seq_id] = k;
+                    if (i_batch_beg[seq_id] < 0) {
+                        i_batch_beg[seq_id] = k;
+                    }
+                }
+            }
+        }
+
+        auto * ctx_tgt = this->params.ctx_tgt;
+        auto * ctx_dft = this->params.ctx_dft;
+
+        const size_t row_bytes = (size_t) n_embd * sizeof(float);
+
+        common_batch_clear(batch);
+
+        for (int k = 0; k < n_tokens; ++k) {
+            common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0);
+        }
+
+        // shift the tgt embeddings to the right by one position
+        // assumes that the tokens in the batch are sequential for each sequence
+        // i.e. we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1]
+        //                                                       ^--- this is a problem
+        // TODO:this is generally true, but would be nice to assert it
+        {
+            const float * h_tgt = llama_get_embeddings_pre_norm(ctx_tgt);
+            std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1));
+
+            //{
+            //    // string with seq_ids in the batch
+            //    std::stringstream ss;
+            //    for (int i = 0; i < n_tokens; ++i) {
+            //        ss << batch_in.seq_id[i][0] << ",";
+            //    }
+            //    LOG_WRN("%s: batch_in.seq_id = %s\n", __func__, ss.str().c_str());
+            //}
+        }
+
+        // fill the pending embeddings from a previous run
+        auto set_h = [&](int idx, const float * h_row) {
+            std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes);
+        };
+
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            if (i_batch_beg[seq_id] < 0) {
+                continue;
+            }
+
+            set_h(i_batch_beg[seq_id], pending_h[seq_id].data());
+        }
+
+        const int32_t rc = llama_decode(ctx_dft, batch);
+        if (rc != 0) {
+            LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]);
+            return false;
+        }
+
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            if (i_batch_end[seq_id] < 0) {
+                continue;
+            }
+
+            const int32_t n_rows = i_batch_end[seq_id] - i_batch_beg[seq_id] + 1;
+            verify_h_rows[seq_id] = n_rows;
+            verify_h[seq_id].resize((size_t) n_rows * n_embd);
+
+            for (int32_t i = 0; i < n_rows; ++i) {
+                const float * h = llama_get_embeddings_pre_norm_ith(ctx_tgt, i_batch_beg[seq_id] + i);
+                std::memcpy(verify_h[seq_id].data() + (size_t) i * n_embd, h, row_bytes);
+            }
+
+            std::memcpy(pending_h[seq_id].data(),
+                    verify_h[seq_id].data() + (size_t) (n_rows - 1) * n_embd, row_bytes);
+        }
+
+        return true;
+    }
+
+    void draft(common_speculative_draft_params_vec & dparams) override {
+        auto & ctx_dft = params.ctx_dft;
+
+        common_batch_clear(batch);
+
+        // keep track of which sequences are still drafting
+        int n_drafting = 0;
+        std::vector<bool> drafting(n_seq);
+
+        const float * h_row = nullptr;
+        const size_t row_bytes = (size_t) n_embd * sizeof(float);
+
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            auto & dp = dparams[seq_id];
+
+            if (!dp.drafting) {
+                continue;
+            }
+
+            n_drafting++;
+            drafting[seq_id] = true;
+            common_sampler_reset(smpls[seq_id].get());
+
+            common_batch_add(batch, dp.id_last, dp.n_past, { seq_id }, true);
+
+            h_row = pending_h[seq_id].data();
+            std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes);
+        }
+
+        int ret = llama_decode(ctx_dft, batch);
+        if (ret != 0) {
+            LOG_WRN("%s: llama_decode returned %d\n", __func__, ret);
+            return;
+        }
+
+        int i = 0;
+
+        while (n_drafting > 0) {
+            int i_batch = 0;
+
+            common_batch_clear(batch);
+
+            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                if (!drafting[seq_id]) {
+                    continue;
+                }
+
+                auto * smpl = smpls[seq_id].get();
+
+                common_sampler_sample(smpl, ctx_dft, i_batch, true);
+                h_row = llama_get_embeddings_pre_norm_ith(ctx_dft, i_batch);
+                ++i_batch;
+
+                const auto * cur_p = common_sampler_get_candidates(smpl, true);
+
+                for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
+                    LOG_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                            seq_id, k, i, cur_p->data[k].id, cur_p->data[k].p,
+                            common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
+                }
+
+                // add drafted token for each sequence
+                const llama_token id = cur_p->data[0].id;
+
+                // only collect very high-confidence draft tokens
+                if (cur_p->data[0].p < params.p_min) {
+                    drafting[seq_id] = false;
+                    n_drafting--;
+
+                    continue;
+                }
+
+                common_sampler_accept(smpl, id, true);
+
+                auto & dp = dparams.at(seq_id);
+                auto & result = *dp.result;
+
+                result.push_back(id);
+
+                if (params.n_max <= (int) result.size()) {
+                    drafting[seq_id] = false;
+                    n_drafting--;
+                    continue;
+                }
+
+                common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true);
+                std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes);
+            }
+
+            if (batch.n_tokens == 0) {
+                break;
+            }
+
+            // evaluate the drafted tokens on the draft model
+            ret = llama_decode(ctx_dft, batch);
+            if (ret != 0) {
+                LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
+                break;
+            }
+
+            ++i;
+        }
+
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            auto & dp = dparams[seq_id];
+            if (!dp.drafting) {
+                continue;
+            }
+
+            if (dp.result->size() < (size_t) params.n_min) {
+                dp.result->clear();
+            }
+
+            last_n_drafted[seq_id] = (uint16_t) dp.result->size();
+        }
+    }
+
+    void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override {
+        if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
+            return;
+        }
+
+        const int32_t n_rows = verify_h_rows[seq_id];
+        if (n_rows <= 0) {
+            return;
+        }
+
+        const int32_t i_h = std::min<int32_t>(n_accepted, n_rows - 1);
+        const size_t row_bytes = (size_t) n_embd * sizeof(float);
+        std::memcpy(pending_h[seq_id].data(), verify_h[seq_id].data() + (size_t) i_h * n_embd, row_bytes);
+    }
+
+    bool need_embd() const override {
+        return false;
+    }
+
+    bool need_embd_pre_norm() const override {
+        return true;
+    }
 };

 // state of self-speculation (simple implementation, not ngram-map)
@@ -376,7 +789,12 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl {
            common_ngram_simple_config config)
        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, n_seq)
        , params(params.ngram_simple)
-        , config(config) {}
+        , config(config)
+    {
+        LOG_INF("%s: adding speculative implementation 'ngram-simple'\n", __func__);
+        LOG_INF("%s: - size_n=%d, size_m=%d, min_hits=%d\n", __func__,
+                this->params.size_n, this->params.size_m, this->params.min_hits);
+    }

    void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override {
        // noop
@@ -400,26 +818,31 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl {
        }
    }

-    void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override {
+    void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
        // noop
    }
+
+    bool need_embd() const override {
+        return false;
+    }
 };

 struct common_speculative_impl_ngram_map_k : public common_speculative_impl {
-    common_params_speculative_ngram_map params;
-
    // n_seq configs
    std::vector<common_ngram_map> config;

    common_speculative_impl_ngram_map_k(
-            const common_params_speculative & params,
            const common_ngram_map & config,
            uint32_t n_seq)
        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, n_seq)
-        , params(params.ngram_map_k) {
+    {
        for (uint32_t i = 0; i < n_seq; i++) {
            this->config.push_back(config);
        }
+
+        LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(this->type).c_str());
+        LOG_INF("%s: - size_key=%d, size_value=%d, key_only=%d, min_hits=%d\n", __func__,
+                config.size_key, config.size_value, config.key_only, config.min_hits);
    }

    void begin(llama_seq_id seq_id, const llama_tokens & prompt) override {
@@ -446,11 +869,19 @@ struct common_speculative_impl_ngram_map_k : public common_speculative_impl {
        }
    }

-    void accept(llama_seq_id seq_id, uint16_t n_accepted) override {
+    void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override {
        GGML_ASSERT((seq_id < (llama_seq_id) config.size()));

+        if (is_other) {
+            return;
+        }
+
        common_ngram_map_accept(config[seq_id], n_accepted);
    }
+
+    bool need_embd() const override {
+        return false;
+    }
 };

 struct common_speculative_impl_ngram_mod : public common_speculative_impl {
@@ -466,7 +897,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
        // the last position in the prompt that was added to the ngram container
        size_t i_last = 0;

-        // length of the last drafted n‑gram (number of tokens returned by draft)
+        // length of the last drafted n-gram (number of tokens returned by draft)
        size_t n_draft_last = 0;

        // consecutive accept rounds with low acceptance fraction (< 0.5)
@@ -484,8 +915,11 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
        , verbose(std::getenv("LLAMA_TRACE") != nullptr) {
        static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t));

-        LOG_INF("%s: initialized ngram_mod with n_match=%d, size=%zu (%.3f MB)\n", __func__,
-                this->params.n_match, mod.size(), (float)(mod.size_bytes())/1024/1024);
+        LOG_INF("%s: adding speculative implementation 'ngram-mod'\n", __func__);
+        LOG_INF("%s: - n_match=%d, n_max=%d, n_min=%d\n", __func__,
+                this->params.n_match, this->params.n_max, this->params.n_min);
+        LOG_INF("%s: - mod size=%zu (%.3f MB)\n", __func__,
+                mod.size(), (float)(mod.size_bytes())/1024/1024);

        if (this->params.n_match < 16) {
            LOG_WRN("%s: ngram_mod n_match=%d is too small - poor quality is possible, "
@@ -575,7 +1009,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
        }
        result.resize(result.size() - n);

-        // store length of drafted n‑gram for later acceptance analysis
+        // store length of drafted n-gram for later acceptance analysis
        sinfo.n_draft_last = result.size();
    }

@@ -597,17 +1031,21 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
        }
    }

-    void accept(llama_seq_id seq_id, uint16_t n_accepted) override {
+    void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override {
+        if (is_other) {
+            return;
+        }
+
        auto & sinfo = sinfos[seq_id];

        // compute acceptance fraction if we have a recorded draft length
        if (sinfo.n_draft_last > 0) {
            const double f_acc = (double)n_accepted / (double)sinfo.n_draft_last;
-            if (f_acc < 0.5) {
+            if (f_acc < 0.25) {
                sinfo.n_low++;
-                if (sinfo.n_low >= 3) {
+                if (sinfo.n_low >= 5) {
                    if (verbose) {
-                        LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, sinfo.n_low);
+                        LOG_WRN("%s: low acceptance streak (%d) - resetting ngram_mod\n", __func__, sinfo.n_low);
                    }

                    mod.reset();
@@ -619,6 +1057,10 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl {
            }
        }
    }
+
+    bool need_embd() const override {
+        return false;
+    }
 };

 struct common_speculative_impl_ngram_cache : public common_speculative_impl {
@@ -653,6 +1095,12 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl {
        , save_dynamic(save_dynamic)
        , save_static(save_static)
    {
+        LOG_INF("%s: adding speculative implementation 'ngram-cache'\n", __func__);
+        LOG_INF("%s: - n_draft=%d, cache_static=%s, cache_dynamic=%s\n", __func__,
+                n_draft,
+                path_static.empty() ? "none" : path_static.c_str(),
+                path_dynamic.empty() ? "none" : path_dynamic.c_str());
+
        sinfos.resize(n_seq);

        if (!path_static.empty()) {
@@ -749,9 +1197,13 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl {
        }
    }

-    void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override {
+    void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
        // noop
    }
+
+    bool need_embd() const override {
+        return false;
+    }
 };

 struct common_speculative {
@@ -820,6 +1272,7 @@ std::string common_speculative_type_to_str(common_speculative_type type) {
        case COMMON_SPECULATIVE_TYPE_NONE:          return "none";
        case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE:  return "draft-simple";
        case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3:  return "draft-eagle3";
+        case COMMON_SPECULATIVE_TYPE_DRAFT_MTP:     return "draft-mtp";
        case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE:  return "ngram-simple";
        case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:   return "ngram-map-k";
        case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram-map-k4v";
@@ -875,8 +1328,8 @@ common_speculative * common_speculative_init(common_params_speculative & params,
        bool has_draft_model_path = !params.draft.mparams.path.empty();

        bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
-        // bool has_mtp = false; // TODO: add MTP here
        bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
+        bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;

        bool has_ngram_cache   = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_CACHE));
        bool has_ngram_simple  = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE));
@@ -885,7 +1338,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
        bool has_ngram_mod     = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_MOD));

        // when adding a new type - update here the logic above
-        static_assert(COMMON_SPECULATIVE_TYPE_COUNT == 8);
+        static_assert(COMMON_SPECULATIVE_TYPE_COUNT == 9);

        // this list here defines the priority of the speculators
        // the one with highest priority are listed first
@@ -911,7 +1364,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
                LOG_WRN("%s: draft model is not specified - cannot use 'draft' type\n", __func__);
                has_draft_simple = false;
            }
-        } else if (has_draft_model_path) {
+        } else if (has_draft_model_path && !has_mtp && !has_draft_eagle3) {
            LOG_WRN("%s: draft model is specified but 'draft' speculative type is not explicitly enabled - enabling it\n", __func__);
            has_draft_simple = true;
        }
@@ -919,16 +1372,17 @@ common_speculative * common_speculative_init(common_params_speculative & params,
        if (has_draft_simple) {
            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, params));
        }
-        // TODO: add MTP here
        if (has_draft_eagle3) {
            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, params));
        }
+        if (has_mtp) {
+            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, params));
+        }
    }

    std::vector<std::unique_ptr<common_speculative_impl>> impls = {};

    for (const common_speculative_config & config : configs) {
-        LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(config.type).c_str());
        switch (config.type) {
            case COMMON_SPECULATIVE_TYPE_NONE:
                break;
@@ -940,6 +1394,10 @@ common_speculative * common_speculative_init(common_params_speculative & params,
                impls.push_back(std::make_unique<common_speculative_impl_draft_eagle3>(config.params, n_seq));
                break;
            }
+            case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: {
+                impls.push_back(std::make_unique<common_speculative_impl_draft_mtp>(config.params, n_seq));
+                break;
+            }
            case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: {
                common_ngram_map ngram_map = get_common_ngram_map(config.type, config.params.ngram_simple);

@@ -958,11 +1416,16 @@ common_speculative * common_speculative_init(common_params_speculative & params,
                impls.push_back(std::move(state));
                break;
            }
-            case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:
+            case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: {
+                impls.push_back(
+                        std::make_unique<common_speculative_impl_ngram_map_k>(
+                            get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq));
+                break;
+            }
            case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: {
                impls.push_back(
                        std::make_unique<common_speculative_impl_ngram_map_k>(
-                            config.params, get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq));
+                            get_common_ngram_map(config.type, config.params.ngram_map_k4v), n_seq));
                break;
            }
            case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
@@ -984,7 +1447,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
    }

    if (impls.empty()) {
-        LOG_WRN("%s", "no implementations specified for speculative decoding\n");
+        LOG_WRN("%s: no implementations specified for speculative decoding\n", __func__);
        return nullptr;
    }

@@ -1040,6 +1503,34 @@ bool common_speculative_process(common_speculative * spec, const llama_batch & b
    return result;
 }

+bool common_speculative_need_embd(common_speculative * spec) {
+    if (spec == nullptr) {
+        return false;
+    }
+
+    for (auto & impl : spec->impls) {
+        if (impl->need_embd()) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+bool common_speculative_need_embd_pre_norm(common_speculative * spec) {
+    if (spec == nullptr) {
+        return false;
+    }
+
+    for (auto & impl : spec->impls) {
+        if (impl->need_embd_pre_norm()) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
 void common_speculative_draft(common_speculative * spec) {
    if (spec == nullptr) {
        return;
@@ -1122,10 +1613,6 @@ void common_speculative_draft(common_speculative * spec) {
 }

 void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, uint16_t n_accepted) {
-    if (n_accepted == 0) {
-        return;
-    }
-
    common_speculative_impl * impl = spec->impl_last[seq_id];

    GGML_ASSERT(impl);
@@ -1137,9 +1624,16 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
            impl->n_acc_tokens += n_accepted;
        }

-        impl->accept(seq_id, n_accepted);
+        impl->accept(seq_id, n_accepted, false);
        impl->n_call_accept++;
    }
+
+    // accept with the rest of the implementations, using is_other == true
+    for (auto & impl_other : spec->impls) {
+        if (impl_other.get() != impl) {
+            impl_other->accept(seq_id, n_accepted, true);
+        }
+    }
 }

 void common_speculative_print_stats(const common_speculative * spec) {
@@ -1159,7 +1653,7 @@ void common_speculative_print_stats(const common_speculative * spec) {
            str_perf = "";
        }

-        LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
+        LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n",
                common_speculative_type_to_str(impl->type).c_str(),
                impl->n_call_begin, impl->n_call_draft, impl->n_call_accept,
                impl->n_gen_drafts,
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -53,6 +53,12 @@ void common_speculative_begin(common_speculative * spec, llama_seq_id seq_id, co
 // process the batch and update the internal state of the speculative context
 bool common_speculative_process(common_speculative * spec, const llama_batch & batch);

+// true if any implementation requires target post-norm embeddings to be extracted
+bool common_speculative_need_embd(common_speculative * spec);
+
+// true if any implementation requires target pre-norm embeddings to be extracted
+bool common_speculative_need_embd_pre_norm(common_speculative * spec);
+
 // generate drafts for the sequences specified with `common_speculative_get_draft_params`
 void common_speculative_draft(common_speculative * spec);

--- a/conversion/init.py
+++ b/conversion/init.py
@@ -0,0 +1,335 @@
+from __future__ import annotations
+
+from .base import (
+    ModelBase, TextModel, MmprojModel, ModelType, SentencePieceTokenTypes,
+    logger, _mistral_common_installed, _mistral_import_error_msg,
+    get_model_architecture, LazyTorchTensor,
+)
+from typing import Type
+
+
+__all__ = [
+    "ModelBase", "TextModel", "MmprojModel", "ModelType", "SentencePieceTokenTypes",
+    "get_model_architecture", "LazyTorchTensor", "logger",
+    "_mistral_common_installed", "_mistral_import_error_msg",
+    "get_model_class", "print_registered_models", "load_all_models",
+]
+
+
+TEXT_MODEL_MAP: dict[str, str] = {
+    "AfmoeForCausalLM": "afmoe",
+    "ApertusForCausalLM": "llama",
+    "ArceeForCausalLM": "llama",
+    "ArcticForCausalLM": "arctic",
+    "AudioFlamingo3ForConditionalGeneration": "qwen",
+    "BaiChuanForCausalLM": "baichuan",
+    "BaichuanForCausalLM": "baichuan",
+    "BailingMoeForCausalLM": "bailingmoe",
+    "BailingMoeV2ForCausalLM": "bailingmoe",
+    "BambaForCausalLM": "granite",
+    "BertForMaskedLM": "bert",
+    "BertForSequenceClassification": "bert",
+    "BertModel": "bert",
+    "BitnetForCausalLM": "bitnet",
+    "BloomForCausalLM": "bloom",
+    "BloomModel": "bloom",
+    "CamembertModel": "bert",
+    "ChameleonForCausalLM": "chameleon",
+    "ChameleonForConditionalGeneration": "chameleon",
+    "ChatGLMForConditionalGeneration": "chatglm",
+    "ChatGLMModel": "chatglm",
+    "CodeShellForCausalLM": "codeshell",
+    "CogVLMForCausalLM": "cogvlm",
+    "Cohere2ForCausalLM": "command_r",
+    "CohereForCausalLM": "command_r",
+    "DbrxForCausalLM": "dbrx",
+    "DeciLMForCausalLM": "deci",
+    "DeepseekForCausalLM": "deepseek",
+    "DeepseekV2ForCausalLM": "deepseek",
+    "DeepseekV3ForCausalLM": "deepseek",
+    "DistilBertForMaskedLM": "bert",
+    "DistilBertForSequenceClassification": "bert",
+    "DistilBertModel": "bert",
+    "Dots1ForCausalLM": "dots1",
+    "DotsOCRForCausalLM": "qwen",
+    "DreamModel": "dream",
+    "Ernie4_5ForCausalLM": "ernie",
+    "Ernie4_5_ForCausalLM": "ernie",
+    "Ernie4_5_MoeForCausalLM": "ernie",
+    "EuroBertModel": "bert",
+    "Exaone4ForCausalLM": "exaone",
+    "ExaoneForCausalLM": "exaone",
+    "ExaoneMoEForCausalLM": "exaone",
+    "FalconForCausalLM": "falcon",
+    "FalconH1ForCausalLM": "falcon_h1",
+    "FalconMambaForCausalLM": "mamba",
+    "GPT2LMHeadModel": "gpt2",
+    "GPTBigCodeForCausalLM": "starcoder",
+    "GPTNeoXForCausalLM": "gptneox",
+    "GPTRefactForCausalLM": "refact",
+    "Gemma2ForCausalLM": "gemma",
+    "Gemma3ForCausalLM": "gemma",
+    "Gemma3ForConditionalGeneration": "gemma",
+    "Gemma3TextModel": "gemma",
+    "Gemma3nForCausalLM": "gemma",
+    "Gemma3nForConditionalGeneration": "gemma",
+    "Gemma4ForConditionalGeneration": "gemma",
+    "Gemma4ForCausalLM": "gemma",
+    "GemmaForCausalLM": "gemma",
+    "Glm4ForCausalLM": "glm",
+    "Glm4MoeForCausalLM": "glm",
+    "Glm4MoeLiteForCausalLM": "glm",
+    "Glm4vForConditionalGeneration": "glm",
+    "Glm4vMoeForConditionalGeneration": "glm",
+    "GlmForCausalLM": "chatglm",
+    "GlmMoeDsaForCausalLM": "glm",
+    "GlmOcrForConditionalGeneration": "glm",
+    "GptOssForCausalLM": "gpt_oss",
+    "GraniteForCausalLM": "granite",
+    "GraniteMoeForCausalLM": "granite",
+    "GraniteMoeHybridForCausalLM": "granite",
+    "GraniteMoeSharedForCausalLM": "granite",
+    "GraniteSpeechForConditionalGeneration": "granite",
+    "Grok1ForCausalLM": "grok",
+    "GrokForCausalLM": "grok",
+    "GroveMoeForCausalLM": "grovemoe",
+    "HunYuanDenseV1ForCausalLM": "hunyuan",
+    "HunYuanMoEV1ForCausalLM": "hunyuan",
+    "HunYuanVLForConditionalGeneration": "hunyuan",
+    "IQuestCoderForCausalLM": "llama",
+    "InternLM2ForCausalLM": "internlm",
+    "InternLM3ForCausalLM": "internlm",
+    "JAISLMHeadModel": "jais",
+    "Jais2ForCausalLM": "jais",
+    "JambaForCausalLM": "jamba",
+    "JanusForConditionalGeneration": "januspro",
+    "JinaBertForMaskedLM": "bert",
+    "JinaBertModel": "bert",
+    "JinaEmbeddingsV5Model": "bert",
+    "KORMoForCausalLM": "qwen",
+    "KimiK25ForConditionalGeneration": "deepseek",
+    "KimiLinearForCausalLM": "kimi_linear",
+    "KimiLinearModel": "kimi_linear",
+    "KimiVLForConditionalGeneration": "deepseek",
+    "LFM2ForCausalLM": "lfm2",
+    "LLaDAMoEModel": "llada",
+    "LLaDAMoEModelLM": "llada",
+    "LLaDAModelLM": "llada",
+    "LLaMAForCausalLM": "llama",
+    "Lfm25AudioTokenizer": "lfm2",
+    "Lfm2ForCausalLM": "lfm2",
+    "Lfm2Model": "lfm2",
+    "Lfm2MoeForCausalLM": "lfm2",
+    "Llama4ForCausalLM": "llama",
+    "Llama4ForConditionalGeneration": "llama",
+    "LlamaBidirectionalModel": "llama",
+    "LlamaForCausalLM": "llama",
+    "LlamaModel": "llama",
+    "LlavaForConditionalGeneration": "llama",
+    "LlavaStableLMEpochForCausalLM": "stablelm",
+    "MPTForCausalLM": "mpt",
+    "MT5ForConditionalGeneration": "t5",
+    "MaincoderForCausalLM": "maincoder",
+    "Mamba2ForCausalLM": "mamba",
+    "MambaForCausalLM": "mamba",
+    "MambaLMHeadModel": "mamba",
+    "MiMoV2FlashForCausalLM": "mimo",
+    "MiMoV2ForCausalLM": "mimo",
+    "MiniCPM3ForCausalLM": "minicpm",
+    "MiniCPMForCausalLM": "minicpm",
+    "MiniCPMV4_6ForConditionalGeneration": "minicpm",
+    "MiniMaxM2ForCausalLM": "minimax",
+    "Ministral3ForCausalLM": "mistral3",
+    "Mistral3ForConditionalGeneration": "mistral3",
+    "MistralForCausalLM": "llama",
+    "MixtralForCausalLM": "llama",
+    "ModernBertForMaskedLM": "bert",
+    "ModernBertForSequenceClassification": "bert",
+    "ModernBertModel": "bert",
+    "NemotronForCausalLM": "nemotron",
+    "NemotronHForCausalLM": "nemotron",
+    "NeoBERT": "bert",
+    "NeoBERTForSequenceClassification": "bert",
+    "NeoBERTLMHead": "bert",
+    "NomicBertModel": "bert",
+    "OLMoForCausalLM": "olmo",
+    "Olmo2ForCausalLM": "olmo",
+    "Olmo3ForCausalLM": "olmo",
+    "OlmoForCausalLM": "olmo",
+    "OlmoeForCausalLM": "olmo",
+    "OpenELMForCausalLM": "openelm",
+    "OrionForCausalLM": "orion",
+    "PLMForCausalLM": "plm",
+    "PLaMo2ForCausalLM": "plamo",
+    "PLaMo3ForCausalLM": "plamo",
+    "PaddleOCRVLForConditionalGeneration": "ernie",
+    "PanguEmbeddedForCausalLM": "pangu",
+    "Phi3ForCausalLM": "phi",
+    "Phi4ForCausalLMV": "phi",
+    "PhiForCausalLM": "phi",
+    "PhiMoEForCausalLM": "phi",
+    "Plamo2ForCausalLM": "plamo",
+    "Plamo3ForCausalLM": "plamo",
+    "PlamoForCausalLM": "plamo",
+    "QWenLMHeadModel": "qwen",
+    "Qwen2AudioForConditionalGeneration": "qwen",
+    "Qwen2ForCausalLM": "qwen",
+    "Qwen2Model": "qwen",
+    "Qwen2MoeForCausalLM": "qwen",
+    "Qwen2VLForConditionalGeneration": "qwenvl",
+    "Qwen2VLModel": "qwenvl",
+    "Qwen2_5OmniModel": "qwenvl",
+    "Qwen2_5_VLForConditionalGeneration": "qwenvl",
+    "Qwen3ASRForConditionalGeneration": "qwen3vl",
+    "Qwen3ForCausalLM": "qwen",
+    "Qwen3Model": "qwen",
+    "Qwen3MoeForCausalLM": "qwen",
+    "Qwen3NextForCausalLM": "qwen",
+    "Qwen3OmniMoeForConditionalGeneration": "qwen3vl",
+    "Qwen3VLForConditionalGeneration": "qwen3vl",
+    "Qwen3VLMoeForConditionalGeneration": "qwen3vl",
+    "Qwen3_5ForCausalLM": "qwen",
+    "Qwen3_5ForConditionalGeneration": "qwen",
+    "Qwen3_5MoeForCausalLM": "qwen",
+    "Qwen3_5MoeForConditionalGeneration": "qwen",
+    "RND1": "qwen",
+    "RWForCausalLM": "falcon",
+    "RWKV6Qwen2ForCausalLM": "rwkv",
+    "RWKV7ForCausalLM": "rwkv",
+    "RobertaForSequenceClassification": "bert",
+    "RobertaModel": "bert",
+    "RuGPT3XLForCausalLM": "gpt2",
+    "Rwkv6ForCausalLM": "rwkv",
+    "Rwkv7ForCausalLM": "rwkv",
+    "RwkvHybridForCausalLM": "rwkv",
+    "Sarashina2VisionForCausalLM": "sarashina2",
+    "SarvamMoEForCausalLM": "bailingmoe",
+    "SeedOssForCausalLM": "olmo",
+    "SmallThinkerForCausalLM": "smallthinker",
+    "SmolLM3ForCausalLM": "llama",
+    "SolarOpenForCausalLM": "glm",
+    "StableLMEpochForCausalLM": "stablelm",
+    "StableLmForCausalLM": "stablelm",
+    "Starcoder2ForCausalLM": "starcoder",
+    "Step3p5ForCausalLM": "step3",
+    "StepVLForConditionalGeneration": "step3",
+    "T5EncoderModel": "t5",
+    "T5ForConditionalGeneration": "t5",
+    "T5WithLMHeadModel": "t5",
+    "TalkieForCausalLM": "talkie",
+    "UMT5ForConditionalGeneration": "t5",
+    "UMT5Model": "t5",
+    "UltravoxModel": "ultravox",
+    "VLlama3ForCausalLM": "llama",
+    "VoxtralForConditionalGeneration": "llama",
+    "WavTokenizerDec": "wavtokenizer",
+    "XLMRobertaForSequenceClassification": "bert",
+    "XLMRobertaModel": "bert",
+    "XverseForCausalLM": "xverse",
+    "YoutuForCausalLM": "deepseek",
+    "YoutuVLForConditionalGeneration": "deepseek",
+    "modeling_grove_moe.GroveMoeForCausalLM": "grovemoe",
+    "modeling_sarvam_moe.SarvamMoEForCausalLM": "bailingmoe",
+}
+
+
+MMPROJ_MODEL_MAP: dict[str, str] = {
+    "AudioFlamingo3ForConditionalGeneration": "ultravox",
+    "CogVLMForCausalLM": "cogvlm",
+    "DeepseekOCRForCausalLM": "deepseek",
+    "DotsOCRForCausalLM": "dotsocr",
+    "Gemma3ForConditionalGeneration": "gemma",
+    "Gemma3nForConditionalGeneration": "gemma",
+    "Gemma4ForConditionalGeneration": "gemma",
+    "Glm4vForConditionalGeneration": "qwen3vl",
+    "Glm4vMoeForConditionalGeneration": "qwen3vl",
+    "GlmOcrForConditionalGeneration": "qwen3vl",
+    "GlmasrModel": "ultravox",
+    "GraniteSpeechForConditionalGeneration": "granite",
+    "HunYuanVLForConditionalGeneration": "hunyuan",
+    "Idefics3ForConditionalGeneration": "smolvlm",
+    "InternVisionModel": "internvl",
+    "JanusForConditionalGeneration": "januspro",
+    "KimiK25ForConditionalGeneration": "kimivl",
+    "KimiVLForConditionalGeneration": "kimivl",
+    "Lfm2AudioForConditionalGeneration": "lfm2",
+    "Lfm2VlForConditionalGeneration": "lfm2",
+    "LightOnOCRForConditionalGeneration": "lighton_ocr",
+    "Llama4ForConditionalGeneration": "llama4",
+    "LlavaForConditionalGeneration": "llava",
+    "MERaLiON2ForConditionalGeneration": "ultravox",
+    "MiMoV2ForCausalLM": "mimo",
+    "MiniCPMV4_6ForConditionalGeneration": "minicpm",
+    "Mistral3ForConditionalGeneration": "llava",
+    "NemotronH_Nano_VL_V2": "nemotron",
+    "PaddleOCRVisionModel": "ernie",
+    "Phi4ForCausalLMV": "phi",
+    "Qwen2AudioForConditionalGeneration": "ultravox",
+    "Qwen2VLForConditionalGeneration": "qwenvl",
+    "Qwen2VLModel": "qwenvl",
+    "Qwen2_5OmniModel": "qwenvl",
+    "Qwen2_5_VLForConditionalGeneration": "qwenvl",
+    "Qwen3ASRForConditionalGeneration": "qwen3vl",
+    "Qwen3OmniMoeForConditionalGeneration": "qwen3vl",
+    "Qwen3VLForConditionalGeneration": "qwen3vl",
+    "Qwen3VLMoeForConditionalGeneration": "qwen3vl",
+    "Qwen3_5ForConditionalGeneration": "qwen3vl",
+    "Qwen3_5MoeForConditionalGeneration": "qwen3vl",
+    "RADIOModel": "nemotron",
+    "Sarashina2VisionForCausalLM": "sarashina2",
+    "SmolVLMForConditionalGeneration": "smolvlm",
+    "StepVLForConditionalGeneration": "step3",
+    "UltravoxModel": "ultravox",
+    "VoxtralForConditionalGeneration": "ultravox",
+    "YoutuVLForConditionalGeneration": "youtuvl",
+}
+
+
+_TEXT_MODEL_MODULES = sorted(set(TEXT_MODEL_MAP.values()))
+_MMPROJ_MODEL_MODULES = sorted(set(MMPROJ_MODEL_MAP.values()))
+
+
+_loaded_text_modules: set[str] = set()
+_loaded_mmproj_modules: set[str] = set()
+
+
+def load_all_models() -> None:
+    """Import all model modules to trigger @ModelBase.register() decorators."""
+    if len(_loaded_text_modules) != len(_TEXT_MODEL_MODULES):
+        for module_name in _TEXT_MODEL_MODULES:
+            if module_name not in _loaded_text_modules:
+                try:
+                    __import__(f"conversion.{module_name}")
+                    _loaded_text_modules.add(module_name)
+                except Exception as e:
+                    logger.warning(f"Failed to load model module {module_name}: {e}")
+
+    if len(_loaded_mmproj_modules) != len(_MMPROJ_MODEL_MODULES):
+        for module_name in _MMPROJ_MODEL_MODULES:
+            if module_name not in _loaded_mmproj_modules:
+                try:
+                    __import__(f"conversion.{module_name}")
+                    _loaded_mmproj_modules.add(module_name)
+                except Exception as e:
+                    logger.warning(f"Failed to load model module {module_name}: {e}")
+
+
+def get_model_class(name: str, mmproj: bool = False) -> Type[ModelBase]:
+    """Dynamically import and return a model class by its HuggingFace architecture name."""
+    relevant_map = MMPROJ_MODEL_MAP if mmproj else TEXT_MODEL_MAP
+    if name not in relevant_map:
+        raise NotImplementedError(f"Architecture {name!r} not supported!")
+    module_name = relevant_map[name]
+    __import__(f"conversion.{module_name}")
+    model_type = ModelType.MMPROJ if mmproj else ModelType.TEXT
+    return ModelBase._model_classes[model_type][name]
+
+
+def print_registered_models() -> None:
+    load_all_models()
+    logger.error("TEXT models:")
+    for name in sorted(TEXT_MODEL_MAP.keys()):
+        logger.error(f"  - {name}")
+    logger.error("MMPROJ models:")
+    for name in sorted(MMPROJ_MODEL_MAP.keys()):
+        logger.error(f"  - {name}")
--- a/conversion/afmoe.py
+++ b/conversion/afmoe.py
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+from typing import Callable, Iterable, TYPE_CHECKING
+
+import torch
+
+if TYPE_CHECKING:
+    from torch import Tensor
+
+from .base import ModelBase, gguf
+
+from .llama import LlamaModel
+
+
+@ModelBase.register("AfmoeForCausalLM")
+class AfmoeModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.AFMOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # MoE parameters
+        if (n_shared_experts := self.hparams.get("num_shared_experts")) is not None:
+            self.gguf_writer.add_expert_shared_count(n_shared_experts)
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (n_dense_layers := self.hparams.get("num_dense_layers")) is not None:
+            self.gguf_writer.add_leading_dense_block_count(n_dense_layers)
+
+        # Route normalization and scaling
+        if (route_norm := self.hparams.get("route_norm")) is not None:
+            self.gguf_writer.add_expert_weights_norm(route_norm)
+        if (route_scale := self.hparams.get("route_scale")) is not None:
+            self.gguf_writer.add_expert_weights_scale(route_scale)
+
+        # Sliding window attention
+        if (sliding_window := self.hparams.get("sliding_window")) is not None:
+            self.gguf_writer.add_sliding_window(sliding_window)
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, gen = item
+
+        if name.endswith(".expert_bias"):
+            name = name.replace(".expert_bias", ".expert_bias.bias")
+
+        return super().filter_tensors((name, gen))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Handle expert weights - they're already merged in the HF format
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.find_hparam(["num_local_experts", "num_experts"])
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for w_name in ["gate_proj", "up_proj", "down_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename_to_retrieve])
+                        del self._experts[bid][ename_to_retrieve]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    yield from ModelBase.modify_tensors(self, data_torch, merged_name, bid)
+
+                return
+            else:
+                return
+
+        yield from ModelBase.modify_tensors(self, data_torch, name, bid)
--- a/conversion/arctic.py
+++ b/conversion/arctic.py
@@ -0,0 +1,162 @@
+from __future__ import annotations
+
+import json
+import sys
+
+from typing import Iterable, TYPE_CHECKING
+
+import torch
+
+if TYPE_CHECKING:
+    from torch import Tensor
+
+from .base import ModelBase, SentencePieceTokenTypes, TextModel, gguf, logger
+
+from .llama import LlamaModel
+
+
+@ModelBase.register("ArcticForCausalLM")
+class ArcticModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.ARCTIC
+
+    def set_vocab(self):
+        # The reason for using a custom implementation here is that the
+        # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
+        # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        if not tokenizer_path.is_file():
+            logger.error(f'Error: Missing {tokenizer_path}')
+            sys.exit(1)
+
+        # Read the whole vocabulary from the tokenizer.model file
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        # Use the added_tokens_decoder field from tokeniser_config.json as the source
+        # of information about added/redefined tokens and modify them accordingly.
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
+                    for token_id, token_json in added_tokens_decoder.items():
+                        token_id = int(token_id)
+                        if token_id >= vocab_size:
+                            logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                            continue
+
+                        token_content = token_json["content"]
+                        token_type = SentencePieceTokenTypes.USER_DEFINED
+                        token_score = -10000.0
+
+                        # Map unk_token to UNKNOWN, other special tokens to CONTROL
+                        # Set the score to 0.0 as in the original tokenizer.model
+                        if ("special" in token_json) and token_json["special"]:
+                            if token_content == tokenizer_config_json["unk_token"]:
+                                token_type = SentencePieceTokenTypes.UNKNOWN
+                            else:
+                                token_type = SentencePieceTokenTypes.CONTROL
+                            token_score = 0.0
+
+                        logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
+                        tokens[token_id] = token_content.encode("utf-8")
+                        toktypes[token_id] = token_type
+                        scores[token_id] = token_score
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith("q_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith("k_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for wid in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
+
+                    yield from super().modify_tensors(data_torch, merged_name, bid)
+                return
+            else:
+                return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
--- a/conversion/baichuan.py
+++ b/conversion/baichuan.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+from typing import Iterable, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from torch import Tensor
+
+from .base import ModelBase, TextModel, gguf, logger
+
+
+@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
+class BaichuanModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.BAICHUAN
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
+        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        head_count = self.hparams["num_attention_heads"]
+        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
+
+        if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight":
+            logger.info(f"Unpacking and permuting layer {bid}")
+            yield from [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid),
+                    self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid),
+                    self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid),
+                    self._reverse_hf_part(data_torch, 2)),
+            ]
+        else:
+            yield from self.modify_tensors(data_torch, self.map_tensor_name(name), bid)
+
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+    def _reverse_hf_permute_part(
+        self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
+    ) -> Tensor:
+        r = weights.shape[0] // 3
+        return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)
+
+    def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
+        r = weights.shape[0] // 3
+        return weights[r * n_part:r * n_part + r, ...]
--- a/conversion/bailingmoe.py
+++ b/conversion/bailingmoe.py
@@ -0,0 +1,216 @@
+from __future__ import annotations
+
+from typing import Callable, Iterable, TYPE_CHECKING
+
+import torch
+
+if TYPE_CHECKING:
+    from torch import Tensor
+
+from .base import ModelBase, TextModel, gguf
+
+
+@ModelBase.register("BailingMoeForCausalLM")
+class BailingMoeModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(1.0)
+        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        n_embd = self.hparams["hidden_size"]
+        if (head_dim := self.hparams.get("head_dim")) is None:
+            head_dim = n_embd // n_head
+
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+
+        if name.endswith("attention.dense.weight"):
+            yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), bid)
+            return
+        elif name.endswith("query_key_value.weight"):
+            q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
+
+            yield from super().modify_tensors(BailingMoeModel.permute(q, n_head, n_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid)
+            yield from super().modify_tensors(BailingMoeModel.permute(k, n_head, n_kv_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid)
+            yield from super().modify_tensors(v,self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid)
+            return
+        elif name.find("mlp.experts") != -1:
+            n_experts = self.find_hparam(["num_local_experts", "num_experts"])
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    yield from super().modify_tensors(data_torch, new_name, bid)
+
+            return
+
+        new_name = self.map_tensor_name(name)
+
+        if new_name == output_name and self.hparams.get("norm_head"):
+            data_torch = data_torch.float()
+            data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
+
+        yield from super().modify_tensors(data_torch, new_name, bid)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("BailingMoeV2ForCausalLM")
+class BailingMoeV2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE2
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if nextn_layers := self.hparams.get("num_nextn_predict_layers", 0):
+            self.block_count = self.hparams["num_hidden_layers"] + nextn_layers
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_shared_feed_forward_length(hparams.get("moe_shared_expert_intermediate_size", hparams["moe_intermediate_size"] * hparams["num_shared_experts"]))
+        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+        if (nextn_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(nextn_layers)
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, gen = item
+
+        if name.endswith(".expert_bias"):
+            name = name.replace(".expert_bias", ".expert_bias.bias")
+
+        return super().filter_tensors((name, gen))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if "mlp.experts" in name:
+            n_experts = self.find_hparam(["num_local_experts", "num_experts"])
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    yield from super().modify_tensors(data_torch, merged_name, bid)
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("SarvamMoEForCausalLM", "modeling_sarvam_moe.SarvamMoEForCausalLM")
+class SarvamMoEModel(BailingMoeV2Model):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE2
+    # Sarvam-MoE shares the BailingMoeV2 architecture; only differences:
+    #  - full rotary (no partial_rotary_factor)
+    #  - expert bias is zero-mean normalized at load time
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
+        # Override the partial-rotary value written by BailingMoeV2 with the full rotary dim
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, gen = item
+        if name.endswith(".expert_bias"):
+            # Sarvam normalizes expert bias to zero mean
+            inner = gen
+
+            def gen():
+                t = inner()
+                return t - t.mean()
+        return super().filter_tensors((name, gen))
--- a/conversion/base.py
+++ b/conversion/base.py
--- a/Show More
+++ b/Show More