common: update env var name (#17588 )

CUDA: add stream-based concurrency (#16991 )
* CUDA: add stream-based concurrency * HIP: fix hipStreamWaitEvent define and nodiscard warnings * ggml-cuda: fix fusion inside stream * ggml-cuda: fix bug w.r.t first stream launch * ggml-cuda: format * ggml-cuda: improve assert message * ggml-cuda: use lambda instead of duplicating code * ggml-cuda: add some more comments * ggml-cuda: add more detailed comments about concurrency * ggml-cuda: rename + remove unused var * ggml-cuda: fix condition for stream launch * ggml-cuda: address review comments, add destructor * common.cuh: add is_valid for concurrent events * common.cuh: make comment better * update comment Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * update comment Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * common.cuh: fix lower_bound condition + remove join_node data from write_ranges * ggml-cuda: fix overlap condition + shadowing parameter --------- Co-authored-by: Carl Philipp Klemm <carl@uvos.xyz> Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2026-02-05 13:53:23 +02:00 · 2025-11-30 09:59:25 +08:00 · 2025-11-30 08:17:55 +08:00 · 2025-11-30 08:16:28 +08:00 · 2025-11-30 01:03:21 +01:00 · 2025-11-29 22:04:44 +01:00
1030 changed files with 223708 additions and 76067 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -17,6 +17,7 @@ Checks: >
    clang-analyzer-*,
    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
    performance-*,
+    -performance-enum-size,
    portability-*,
    -portability-simd-intrinsics,
    misc-*,
--- a/.devops/cann.Dockerfile
+++ b/.devops/cann.Dockerfile
@@ -3,7 +3,8 @@
 # ==============================================================================

 # Define the CANN base image for easier version updates later
-ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
+ARG CHIP_TYPE=910b
+ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.3.rc1.alpha001-${CHIP_TYPE}-openeuler22.03-py3.11

 # ==============================================================================
 # BUILD STAGE
@@ -11,9 +12,6 @@ ARG CANN_BASE_IMAGE=quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.10
 # ==============================================================================
 FROM ${CANN_BASE_IMAGE} AS build

-# Define the Ascend chip model for compilation. Default is Ascend910B3
-ARG ASCEND_SOC_TYPE=Ascend910B3
-
 # -- Install build dependencies --
 RUN yum install -y gcc g++ cmake make git libcurl-devel python3 python3-pip && \
    yum clean all && \
@@ -36,20 +34,21 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
 # For brevity, only core variables are listed here. You can paste the original ENV list here.

 # -- Build llama.cpp --
-# Use the passed ASCEND_SOC_TYPE argument and add general build options
+# Use the passed CHIP_TYPE argument and add general build options
+ARG CHIP_TYPE
 RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh --force \
    && \
    cmake -B build \
        -DGGML_CANN=ON \
        -DCMAKE_BUILD_TYPE=Release \
-        -DSOC_TYPE=${ASCEND_SOC_TYPE} \
+        -DSOC_TYPE=ascend${CHIP_TYPE} \
        . && \
    cmake --build build --config Release -j$(nproc)

 # -- Organize build artifacts for copying in later stages --
 # Create a lib directory to store all .so files
 RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
+    find build -name "*.so*" -exec cp -P {} /app/lib \;

 # Create a full directory to store all executables and Python scripts
 RUN mkdir -p /app/full && \
--- a/.devops/cpu.Dockerfile
+++ b/.devops/cpu.Dockerfile
@@ -20,7 +20,7 @@ RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \
    cmake --build build -j $(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
+    find build -name "*.so*" -exec cp -P {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
--- a/.devops/cuda.Dockerfile
+++ b/.devops/cuda.Dockerfile
@@ -25,7 +25,7 @@ RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
+    find build -name "*.so*" -exec cp -P {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -1,8 +1,8 @@
-ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04
+ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04

 ## Build Image

-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
+FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
@@ -21,7 +21,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
+    find build -name "*.so*" -exec cp -P {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
@@ -31,7 +31,7 @@ RUN mkdir -p /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base
+FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

 RUN apt-get update \
    && apt-get install -y libgomp1 curl\
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc4.2.0
+ARG MUSA_VERSION=rc4.3.0
 # Target the MUSA build image
 ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

@@ -32,7 +32,7 @@ RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
+    find build -name "*.so*" -exec cp -P {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -34,6 +34,7 @@
  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
  enableCurl ? true,
  useVulkan ? false,
+  useRpc ? false,
  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

  # It's necessary to consistently use backendStdenv when building with CUDA support,
@@ -128,10 +129,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
  };

  postPatch = ''
-    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
-      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
-      --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
  '';

  # With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
@@ -179,6 +176,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
      (cmakeBool "GGML_METAL" useMetalKit)
      (cmakeBool "GGML_VULKAN" useVulkan)
      (cmakeBool "GGML_STATIC" enableStatic)
+      (cmakeBool "GGML_RPC" useRpc)
    ]
    ++ optionals useCuda [
      (
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -1,8 +1,8 @@
 ARG UBUNTU_VERSION=24.04

 # This needs to generally match the container host's environment.
-ARG ROCM_VERSION=6.4
-ARG AMDGPU_VERSION=6.4
+ARG ROCM_VERSION=7.0
+ARG AMDGPU_VERSION=7.0

 # Target the ROCm build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
@@ -13,9 +13,8 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
-# gfx906 is deprecated
-#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html
+# gfx803, gfx900, gfx906, gfx1032, gfx1101, gfx1102,not officialy supported
+# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html

 ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151'
 #ARG ROCM_DOCKER_ARCH='gfx1151'
@@ -36,20 +35,17 @@ WORKDIR /app

 COPY . .

-RUN git clone https://github.com/rocm/rocwmma --branch develop --depth 1
-
 RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build \
        -DGGML_HIP=ON \
        -DGGML_HIP_ROCWMMA_FATTN=ON \
-        -DCMAKE_HIP_FLAGS="-I$(pwd)/rocwmma/library/include/" \
        -DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \
        -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \
        -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
    && cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib \
-    && find build -name "*.so" -exec cp {} /app/lib \;
+    && find build -name "*.so*" -exec cp -P {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -0,0 +1,126 @@
+ARG GCC_VERSION=15.2.0
+ARG UBUNTU_VERSION=24.04
+
+### Build Llama.cpp stage
+FROM gcc:${GCC_VERSION} AS build
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
+    apt update -y && \
+    apt upgrade -y && \
+    apt install -y --no-install-recommends \
+        git cmake ccache ninja-build \
+        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
+        libopenblas-dev libcurl4-openssl-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+COPY . .
+
+RUN --mount=type=cache,target=/root/.ccache \
+    --mount=type=cache,target=/app/build \
+    cmake -S . -B build -G Ninja \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+        -DLLAMA_BUILD_TESTS=OFF \
+        -DGGML_NATIVE=OFF \
+        -DGGML_BACKEND_DL=ON \
+        -DGGML_CPU_ALL_VARIANTS=ON \
+        -DGGML_BLAS=ON \
+        -DGGML_BLAS_VENDOR=OpenBLAS && \
+    cmake --build build --config Release -j $(nproc) && \
+    cmake --install build --prefix /opt/llama.cpp
+
+COPY *.py             /opt/llama.cpp/bin
+COPY .devops/tools.sh /opt/llama.cpp/bin
+
+COPY gguf-py          /opt/llama.cpp/gguf-py
+COPY requirements.txt /opt/llama.cpp/gguf-py
+COPY requirements     /opt/llama.cpp/gguf-py/requirements
+
+
+### Collect all llama.cpp binaries, libraries and distro libraries
+FROM scratch AS collector
+
+# Copy llama.cpp binaries and libraries
+COPY --from=build /opt/llama.cpp/bin     /llama.cpp/bin
+COPY --from=build /opt/llama.cpp/lib     /llama.cpp/lib
+COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
+
+
+### Base image
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
+    apt update -y && \
+    apt install -y --no-install-recommends \
+        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
+        # See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
+        curl libgomp1 libopenblas-dev && \
+    apt autoremove -y && \
+    apt clean -y && \
+    rm -rf /tmp/* /var/tmp/* && \
+    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
+    find /var/cache -type f -delete
+
+# Copy llama.cpp libraries
+COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu
+
+
+### Full
+FROM base AS full
+
+ENV PATH="/root/.cargo/bin:${PATH}"
+WORKDIR /app
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
+    apt update -y && \
+    apt install -y \
+        git cmake libjpeg-dev \
+        python3 python3-pip python3-dev && \
+    apt autoremove -y && \
+    apt clean -y && \
+    rm -rf /tmp/* /var/tmp/* && \
+    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
+    find /var/cache -type f -delete
+
+RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
+
+COPY --from=collector /llama.cpp/bin /app
+COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
+
+RUN pip install --no-cache-dir --break-system-packages \
+        -r /app/gguf-py/requirements.txt
+
+ENTRYPOINT [ "/app/tools.sh" ]
+
+
+### CLI Only
+FROM base AS light
+
+WORKDIR /llama.cpp/bin
+
+# Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
+
+ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
+
+
+### Server
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+WORKDIR /llama.cpp/bin
+
+# Copy llama.cpp binaries and libraries
+COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
+COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
+
+EXPOSE 8080
+
+ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -1,42 +1,24 @@
-ARG UBUNTU_VERSION=24.04
+ARG UBUNTU_VERSION=26.04

 FROM ubuntu:$UBUNTU_VERSION AS build

-# Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html
-
 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget xz-utils

-# Install Vulkan SDK
-ARG VULKAN_VERSION=1.4.321.1
-RUN ARCH=$(uname -m) && \
-    wget -qO /tmp/vulkan-sdk.tar.xz https://sdk.lunarg.com/sdk/download/${VULKAN_VERSION}/linux/vulkan-sdk-linux-${ARCH}-${VULKAN_VERSION}.tar.xz && \
-    mkdir -p /opt/vulkan && \
-    tar -xf /tmp/vulkan-sdk.tar.xz -C /tmp --strip-components=1 && \
-    mv /tmp/${ARCH}/* /opt/vulkan/ && \
-    rm -rf /tmp/*
-
 # Install cURL and Vulkan SDK dependencies
 RUN apt install -y libcurl4-openssl-dev curl \
-    libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev
-
-# Set environment variables
-ENV VULKAN_SDK=/opt/vulkan
-ENV PATH=$VULKAN_SDK/bin:$PATH
-ENV LD_LIBRARY_PATH=$VULKAN_SDK/lib:$LD_LIBRARY_PATH
-ENV CMAKE_PREFIX_PATH=$VULKAN_SDK:$CMAKE_PREFIX_PATH
-ENV PKG_CONFIG_PATH=$VULKAN_SDK/lib/pkgconfig:$PKG_CONFIG_PATH
+    libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libvulkan-dev glslc

 # Build it
 WORKDIR /app

 COPY . .

-RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1  -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \
    cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib && \
-    find build -name "*.so" -exec cp {} /app/lib \;
+    find build -name "*.so*" -exec cp -P {} /app/lib \;

 RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
@@ -50,7 +32,7 @@ RUN mkdir -p /app/full \
 FROM ubuntu:$UBUNTU_VERSION AS base

 RUN apt-get update \
-    && apt-get install -y libgomp1 curl libvulkan-dev \
+    && apt-get install -y libgomp1 curl libvulkan1 mesa-vulkan-drivers \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
@@ -68,6 +50,7 @@ WORKDIR /app

 RUN apt-get update \
    && apt-get install -y \
+    build-essential \
    git \
    python3 \
    python3-pip \
--- a/.editorconfig
+++ b/.editorconfig
@@ -60,3 +60,11 @@ end_of_line = unset
 charset = unset
 trim_trailing_whitespace = unset
 insert_final_newline = unset
+
+[benches/**]
+indent_style = unset
+indent_size = unset
+end_of_line = unset
+charset = unset
+trim_trailing_whitespace = unset
+insert_final_newline = unset
--- a/.github/actions/install-exe/action.yml
+++ b/.github/actions/install-exe/action.yml
@@ -0,0 +1,36 @@
+name: "Install exe"
+description: "Download and install exe"
+inputs:
+  url:
+    description: "URL of the exe installer"
+    required: true
+  args:
+    description: "Installer arguments"
+    required: true
+  timeout:
+    description: "Timeout (in ms)"
+    required: false
+    default: "600000"
+
+runs:
+  using: "composite"
+  steps:
+    - name: Install EXE
+      shell: pwsh
+      run: |
+        $ErrorActionPreference = "Stop"
+        write-host "Downloading Installer EXE"
+        Invoke-WebRequest -Uri "${{ inputs.url }}" -OutFile "${env:RUNNER_TEMP}\temp-install.exe"
+        write-host "Installing"
+        $proc = Start-Process "${env:RUNNER_TEMP}\temp-install.exe" -ArgumentList '${{ inputs.args }}' -NoNewWindow -PassThru
+        $completed = $proc.WaitForExit(${{ inputs.timeout }})
+        if (-not $completed) {
+            Write-Error "Installer timed out. Killing the process"
+            $proc.Kill()
+            exit 1
+        }
+        if ($proc.ExitCode -ne 0) {
+            Write-Error "Installer failed with exit code $($proc.ExitCode)"
+            exit 1
+        }
+        write-host "Completed installation"
--- a/.github/actions/linux-setup-spacemit/action.yml
+++ b/.github/actions/linux-setup-spacemit/action.yml
@@ -0,0 +1,20 @@
+name: "Linux - Setup SpacemiT Toolchain"
+description: "Setup SpacemiT Toolchain for Linux"
+inputs:
+  path:
+    description: "Installation path"
+    required: true
+  version:
+    description: "SpacemiT toolchain version"
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - name: Setup SpacemiT Toolchain
+      id: setup
+      uses: ./.github/actions/unarchive-tar
+      with:
+        url: https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
+        path: ${{ inputs.path }}
+        strip: 1
--- a/.github/actions/linux-setup-vulkan/action.yml
+++ b/.github/actions/linux-setup-vulkan/action.yml
@@ -0,0 +1,20 @@
+name: "Linux - Setup Vulkan SDK"
+description: "Setup Vulkan SDK for Linux"
+inputs:
+  path:
+    description: "Installation path"
+    required: true
+  version:
+    description: "Vulkan SDK version"
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - name: Setup Vulkan SDK
+      id: setup
+      uses: ./.github/actions/unarchive-tar
+      with:
+        url: https://sdk.lunarg.com/sdk/download/${{ inputs.version }}/linux/vulkan_sdk.tar.xz
+        path: ${{ inputs.path }}
+        strip: 1
--- a/.github/actions/unarchive-tar/action.yml
+++ b/.github/actions/unarchive-tar/action.yml
@@ -0,0 +1,27 @@
+name: "Unarchive tar"
+description: "Download and unarchive tar into directory"
+inputs:
+  url:
+    description: "URL of the tar archive"
+    required: true
+  path:
+    description: "Directory to unarchive into"
+    required: true
+  type:
+    description: "Compression type (tar option)"
+    required: false
+    default: "J"
+  strip:
+    description: "Strip components"
+    required: false
+    default: "0"
+
+runs:
+  using: "composite"
+  steps:
+    - name: Unarchive into directory
+      shell: bash
+      run: |
+        mkdir -p ${{ inputs.path }}
+        cd ${{ inputs.path }}
+        curl --no-progress-meter ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}
--- a/.github/actions/windows-setup-rocm/action.yml
+++ b/.github/actions/windows-setup-rocm/action.yml
@@ -0,0 +1,15 @@
+name: "Windows - Setup ROCm"
+description: "Setup ROCm for Windows"
+inputs:
+  version:
+    description: "ROCm version"
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - name: Setup ROCm
+      uses: ./.github/actions/install-exe
+      with:
+        url: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ inputs.version }}-WinSvr2022-For-HIP.exe
+        args: -install
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -9,7 +9,7 @@ llama.cpp is a large-scale C/C++ project for efficient LLM (Large Language Model
 - **Size**: ~200k+ lines of code across 1000+ files
 - **Architecture**: Modular design with main library (`libllama`) and 40+ executable tools/examples
 - **Core dependency**: ggml tensor library (vendored in `ggml/` directory)
- **Backends supported**: CPU (AVX/NEON optimized), CUDA, Metal, Vulkan, SYCL, ROCm, MUSA
+- **Backends supported**: CPU (AVX/NEON/RVV optimized), CUDA, Metal, Vulkan, SYCL, ROCm, MUSA
 - **License**: MIT

 ## Build Instructions
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -76,6 +76,10 @@ ggml:
    - changed-files:
        - any-glob-to-any-file:
            - ggml/**
+model:
+    - changed-files:
+        - any-glob-to-any-file:
+            - src/models/**
 nix:
    - changed-files:
        - any-glob-to-any-file:
--- a/.github/workflows/build-cache.yml
+++ b/.github/workflows/build-cache.yml
@@ -0,0 +1,89 @@
+name: Build Actions Cache
+
+on:
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    - cron: '0 * * * *'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  ubuntu-24-vulkan-cache:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Get latest Vulkan SDK version
+        id: vulkan_sdk_version
+        run: |
+          echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
+
+      - name: Setup Cache
+        uses: actions/cache@v4
+        id: cache-sdk
+        with:
+          path: ./vulkan_sdk
+          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
+
+      - name: Setup Vulkan SDK
+        if: steps.cache-sdk.outputs.cache-hit != 'true'
+        uses: ./.github/actions/linux-setup-vulkan
+        with:
+          path: ./vulkan_sdk
+          version: ${{ env.VULKAN_SDK_VERSION }}
+
+  ubuntu-24-spacemit-cache:
+    runs-on: ubuntu-24.04
+
+    env:
+      # Make sure this is in sync with build-linux-cross.yml
+      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Cache
+        uses: actions/cache@v4
+        id: cache-toolchain
+        with:
+          path: ./spacemit_toolchain
+          key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
+
+      - name: Setup SpacemiT Toolchain
+        if: steps.cache-toolchain.outputs.cache-hit != 'true'
+        uses: ./.github/actions/linux-setup-spacemit
+        with:
+          path: ./spacemit_toolchain
+          version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
+
+  windows-2022-rocm-cache:
+    runs-on: windows-2022
+
+    env:
+      # Make sure this is in sync with build.yml
+      HIPSDK_INSTALLER_VERSION: "25.Q3"
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Cache
+        uses: actions/cache@v4
+        id: cache-rocm
+        with:
+          path: C:\Program Files\AMD\ROCm
+          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+
+      - name: Setup ROCm
+        if: steps.cache-rocm.outputs.cache-hit != 'true'
+        uses: ./.github/actions/windows-setup-rocm
+        with:
+          version: ${{ env.HIPSDK_INSTALLER_VERSION }}
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -4,49 +4,49 @@ on:
  workflow_call:

 jobs:
-  ubuntu-24-riscv64-cpu-cross:
-    runs-on: ubuntu-24.04
+  # ubuntu-24-riscv64-cpu-cross:
+  #   runs-on: ubuntu-24.04

-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Riscv
-        run: |
-          sudo dpkg --add-architecture riscv64
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - name: Setup Riscv
+  #       run: |
+  #         sudo dpkg --add-architecture riscv64

-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
+  #         # Add arch-specific repositories for non-amd64 architectures
+  #         cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+  #         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+  #         EOF

-          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
+  #         sudo apt-get update || true    ;# Prevent failure due to missing URLs.

-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  gcc-14-riscv64-linux-gnu \
-                  g++-14-riscv64-linux-gnu
+  #         sudo apt-get install -y --no-install-recommends \
+  #                 build-essential \
+  #                 gcc-14-riscv64-linux-gnu \
+  #                 g++-14-riscv64-linux-gnu

-      - name: Build
-        run: |
-          cmake -B build -DLLAMA_CURL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-                         -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+  #     - name: Build
+  #       run: |
+  #         cmake -B build -DLLAMA_CURL=OFF \
+  #                        -DCMAKE_BUILD_TYPE=Release \
+  #                        -DGGML_OPENMP=OFF \
+  #                        -DLLAMA_BUILD_EXAMPLES=ON \
+  #                        -DLLAMA_BUILD_TOOLS=ON \
+  #                        -DLLAMA_BUILD_TESTS=OFF \
+  #                        -DCMAKE_SYSTEM_NAME=Linux \
+  #                        -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+  #                        -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+  #                        -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

-          cmake --build build --config Release -j $(nproc)
+  #         cmake --build build --config Release -j $(nproc)

  # ubuntu-24-riscv64-vulkan-cross:
  #   runs-on: ubuntu-24.04
@@ -141,97 +141,6 @@ jobs:

  #         cmake --build build --config Release -j $(nproc)

-  ubuntu-24-ppc64el-cpu-cross:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup PowerPC64le
-        run: |
-          sudo dpkg --add-architecture ppc64el
-
-          # Add arch-specific repositories for non-amd64 architectures
-          cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
-          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-          EOF
-
-          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
-          sudo apt-get install -y --no-install-recommends \
-                  build-essential \
-                  gcc-14-powerpc64le-linux-gnu \
-                  g++-14-powerpc64le-linux-gnu
-
-      - name: Build
-        run: |
-          cmake -B build -DLLAMA_CURL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DCMAKE_SYSTEM_NAME=Linux \
-                         -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-                         -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-                         -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-          cmake --build build --config Release -j $(nproc)
-
-  # ubuntu-24-ppc64el-vulkan-cross:
-  #   runs-on: ubuntu-24.04
-
-  #   steps:
-  #     - uses: actions/checkout@v4
-  #     - name: Setup PowerPC64le
-  #       run: |
-  #         sudo dpkg --add-architecture ppc64el
-
-  #         # Add arch-specific repositories for non-amd64 architectures
-  #         cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
-  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
-  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
-  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
-  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
-  #         EOF
-
-  #         sudo apt-get update || true    ;# Prevent failure due to missing URLs.
-
-  #         sudo apt-get install -y --no-install-recommends \
-  #                 build-essential \
-  #                 glslc \
-  #                 gcc-14-powerpc64le-linux-gnu \
-  #                 g++-14-powerpc64le-linux-gnu \
-  #                 libvulkan-dev:ppc64el
-
-  #     - name: Build
-  #       run: |
-  #         cmake -B build -DLLAMA_CURL=OFF \
-  #                        -DCMAKE_BUILD_TYPE=Release \
-  #                        -DGGML_VULKAN=ON \
-  #                        -DGGML_OPENMP=OFF \
-  #                        -DLLAMA_BUILD_EXAMPLES=ON \
-  #                        -DLLAMA_BUILD_TOOLS=ON \
-  #                        -DLLAMA_BUILD_TESTS=OFF \
-  #                        -DCMAKE_SYSTEM_NAME=Linux \
-  #                        -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
-  #                        -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
-  #                        -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
-  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-  #         cmake --build build --config Release -j $(nproc)
-
  debian-13-loongarch64-cpu-cross:
    runs-on: ubuntu-24.04
    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
@@ -344,3 +253,45 @@ jobs:
                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

          cmake --build build --config Release -j $(nproc)
+
+  ubuntu-24-riscv64-cpu-spacemit-ime-cross:
+    runs-on: ubuntu-24.04
+
+    env:
+      # Make sure this is in sync with build-cache.yml
+      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Use SpacemiT Toolchain Cache
+        uses: actions/cache@v4
+        id: cache-toolchain
+        with:
+          path: ./spacemit_toolchain
+          key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
+
+      - name: Setup SpacemiT Toolchain
+        if: steps.cache-toolchain.outputs.cache-hit != 'true'
+        uses: ./.github/actions/linux-setup-spacemit
+        with:
+          path: ./spacemit_toolchain
+          version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
+
+      - name: Build
+        run: |
+          export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
+          cmake -B build -DLLAMA_CURL=OFF \
+                         -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DGGML_CPU_RISCV64_SPACEMIT=ON \
+                         -DGGML_RVV=ON \
+                         -DGGML_RV_ZFH=ON \
+                         -DGGML_RV_ZICBOP=ON \
+                         -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
+                         -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
+
+          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build-riscv-native.yml
+++ b/.github/workflows/build-riscv-native.yml
@@ -6,7 +6,7 @@ on:

 jobs:
  debian-13-riscv64-native: # Bianbu 2.2
-    runs-on: self-hosted
+    runs-on: [self-hosted, RISCV64]

    steps:
      - name: Install prerequisites
@@ -58,3 +58,63 @@ jobs:
            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

          cmake --build build --config Release -j $(nproc)
+
+  # debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
+  #   runs-on: [self-hosted, RISCV64]
+
+  #   steps:
+  #     - name: Install prerequisites
+  #       run: |
+  #         sudo apt-get update || true
+  #         sudo apt-get install -y libatomic1
+  #     - uses: actions/checkout@v4
+  #     - name: Setup Riscv
+  #       run: |
+  #         sudo apt-get update || true
+  #         sudo apt-get install -y --no-install-recommends \
+  #                 build-essential \
+  #                 gcc-14-riscv64-linux-gnu \
+  #                 g++-14-riscv64-linux-gnu \
+  #                 ccache \
+  #                 cmake
+  #         sudo apt-get upgrade binutils -y
+
+  #     - name: Setup ccache
+  #       run: |
+  #         mkdir -p $HOME/.ccache
+  #         ccache -M 5G -d $HOME/.ccache
+  #         export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
+  #         export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
+  #         echo "$GITHUB_WORKSPACE"
+  #         echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
+  #         echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
+  #         echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
+  #         echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
+
+  #     - name: Build
+  #       run: |
+  #         cmake -B build \
+  #           -DLLAMA_CURL=OFF \
+  #           -DCMAKE_BUILD_TYPE=Release \
+  #           -DGGML_OPENMP=OFF \
+  #           -DLLAMA_BUILD_EXAMPLES=ON \
+  #           -DLLAMA_BUILD_TOOLS=ON \
+  #           -DLLAMA_BUILD_TESTS=OFF \
+  #           -DCMAKE_SYSTEM_NAME=Linux \
+  #           -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
+  #           -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
+  #           -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
+  #           -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+  #           -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+  #           -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+  #           -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
+  #           -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+  #           -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+  #           -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
+  #           -DGGML_RVV=ON \
+  #           -DGGML_RV_ZFH=ON \
+  #           -DGGML_RV_ZICBOP=ON \
+  #           -DGGML_CPU_RISCV64_SPACEMIT=ON \
+  #           -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
+
+  #         cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
--- a/.github/workflows/check-vendor.yml
+++ b/.github/workflows/check-vendor.yml
@@ -0,0 +1,52 @@
+name: Check vendor
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+    paths: [
+      'vendor/**',
+      'scripts/sync_vendor.py'
+    ]
+
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: [
+      'vendor/**',
+      'scripts/sync_vendor.py'
+    ]
+
+jobs:
+  check-vendor:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.x'
+
+      - name: Run vendor sync
+        run: |
+          set -euo pipefail
+          python3 scripts/sync_vendor.py
+
+      - name: Check for changes
+        run: |
+          set -euo pipefail
+          # detect modified or untracked files
+          changed=$(git status --porcelain --untracked-files=all || true)
+          if [ -n "$changed" ]; then
+            echo "Vendor sync modified files:"
+            echo "$changed" | awk '{ print $2 }' | sed '/^$/d'
+            echo "Failing because vendor files mismatch. Please update scripts/sync_vendor.py"
+            exit 1
+          else
+            echo "Vendor files are up-to-date."
+          fi
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -28,7 +28,7 @@ jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub

-    runs-on: ubuntu-22.04
+    runs-on: ${{ matrix.config.runs_on }}
    env:
      COMMIT_SHA: ${{ github.sha }}
    strategy:
@@ -39,11 +39,12 @@ jobs:
          # Note: the arm64 images are failing, which prevents the amd64 images from being built
          # https://github.com/ggml-org/llama.cpp/issues/11888
          #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
-          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
-          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "cpu",    dockerfile: ".devops/cpu.Dockerfile",    platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
+          - { tag: "cuda",   dockerfile: ".devops/cuda.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
+          - { tag: "musa",   dockerfile: ".devops/musa.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
+          - { tag: "intel",  dockerfile: ".devops/intel.Dockerfile",  platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
+          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
+          - { tag: "s390x",  dockerfile: ".devops/s390x.Dockerfile",  platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
    steps:
@@ -53,6 +54,7 @@ jobs:
          fetch-depth: 0 # preserve git history, so we can determine the build number

      - name: Set up QEMU
+        if: ${{ matrix.config.tag != 's390x' }}
        uses: docker/setup-qemu-action@v3
        with:
          image: tonistiigi/binfmt:qemu-v7.0.0-28
@@ -67,22 +69,19 @@ jobs:
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

-      - name: Determine tag name
+      - name: Determine source tag name
+        id: srctag
+        uses: ./.github/actions/get-tag-name
+        env:
+          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+      - name: Determine image tag name
        id: tag
        shell: bash
        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
          REPO_NAME="${{ github.event.repository.name }}"

-          # determine tag name postfix (build number, commit hash)
-          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
-            TAG_POSTFIX="-b${BUILD_NUMBER}"
-          else
-            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
-            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
-          fi
          # list all tags possible
          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
              TYPE=""
@@ -90,17 +89,19 @@ jobs:
              TYPE="-${{ matrix.config.tag }}"
          fi
          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
-          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
-          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
-          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
+          CACHETAGS="${PREFIX}buildcache${TYPE}"
+          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
+          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
+          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
+          echo "cache_output_tags=$CACHETAGS" >> $GITHUB_OUTPUT
          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
+          echo "cache_output_tags=$CACHETAGS"  # print out for debugging
          echo "full_output_tags=$FULLTAGS"  # print out for debugging
          echo "light_output_tags=$LIGHTTAGS"  # print out for debugging
          echo "server_output_tags=$SERVERTAGS"  # print out for debugging
        env:
-          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

      - name: Free Disk Space (Ubuntu)
@@ -133,11 +134,14 @@ jobs:
          target: full
          provenance: false
          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
+          #cache-from: type=gha
+          #cache-to: type=gha,mode=max
          # return to this if the experimental github cache is having issues
          #cache-to: type=local,dest=/tmp/.buildx-cache
          #cache-from: type=local,src=/tmp/.buildx-cache
+          # using registry cache (no storage limit)
+          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
+          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max

      - name: Build and push Light Docker image (tagged + versioned)
        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
@@ -152,11 +156,14 @@ jobs:
          target: light
          provenance: false
          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
+          #cache-from: type=gha
+          #cache-to: type=gha,mode=max
          # return to this if the experimental github cache is having issues
          #cache-to: type=local,dest=/tmp/.buildx-cache
          #cache-from: type=local,src=/tmp/.buildx-cache
+          # using registry cache (no storage limit)
+          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
+          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max

      - name: Build and push Server Docker image (tagged + versioned)
        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
@@ -171,8 +178,37 @@ jobs:
          target: server
          provenance: false
          # using github experimental cache
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
+          #cache-from: type=gha
+          #cache-to: type=gha,mode=max
          # return to this if the experimental github cache is having issues
          #cache-to: type=local,dest=/tmp/.buildx-cache
          #cache-from: type=local,src=/tmp/.buildx-cache
+          # using registry cache (no storage limit)
+          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
+          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
+
+  create_tag:
+    name: Create and push git tag
+    runs-on: ubuntu-22.04
+    permissions:
+      contents: write
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Determine source tag name
+        id: srctag
+        uses: ./.github/actions/get-tag-name
+        env:
+          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+      - name: Create and push git tag
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          git tag ${{ steps.srctag.outputs.name }} || exit 0
+          git push origin ${{ steps.srctag.outputs.name }} || exit 0
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -75,7 +75,7 @@ jobs:
          name: llama-bin-macos-arm64.zip

  macOS-x64:
-    runs-on: macos-13
+    runs-on: macos-15-intel

    steps:
      - name: Clone
@@ -134,6 +134,8 @@ jobs:
        include:
          - build: 'x64'
            os: ubuntu-22.04
+          - build: 's390x'
+            os: ubuntu-24.04-s390x
          # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
          # - build: 'arm64'
          #   os: ubuntu-22.04-arm
@@ -150,7 +152,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: ubuntu-cpu-cmake
+          key: ubuntu-cpu-cmake-${{ matrix.build }}
          evict-old-files: 1d

      - name: Dependencies
@@ -462,7 +464,7 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"

@@ -505,6 +507,7 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin

          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
@@ -513,10 +516,15 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin

          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin

+          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
+
          echo "cp oneAPI running time dll files to ./build/bin done"
          7z a llama-bin-win-sycl-x64.zip ./build/bin/*

@@ -543,10 +551,12 @@ jobs:
        id: checkout
        uses: actions/checkout@v4

-      - name: Clone rocWMMA repository
-        id: clone_rocwmma
+      - name: Grab rocWMMA package
+        id: grab_rocwmma
        run: |
-          git clone https://github.com/rocm/rocwmma --branch develop --depth 1
+          curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.0.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.0.0.70001-42~24.04_amd64.deb"
+          7z x rocwmma.deb
+          7z x data.tar

      - name: Cache ROCm Installation
        id: cache-rocm
@@ -601,7 +611,7 @@ jobs:
          cmake -G "Unix Makefiles" -B build -S . `
            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.0.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
            -DCMAKE_BUILD_TYPE=Release `
            -DGGML_BACKEND_DL=ON `
            -DGGML_NATIVE=OFF `
@@ -683,6 +693,51 @@ jobs:
          path: llama-${{ steps.tag.outputs.name }}-xcframework.zip
          name: llama-${{ steps.tag.outputs.name }}-xcframework

+  openEuler-cann:
+    strategy:
+      matrix:
+        arch: [x86, aarch64]
+        chip_type: ['910b', '310p']
+        build: ['Release']
+    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}
+    container: ascendai/cann:${{ matrix.chip_type == '910b' &&  '8.3.rc1.alpha001-910b-openeuler22.03-py3.11' || '8.2.rc1-310p-openeuler22.03-py3.11' }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Dependencies
+        run: |
+          yum update -y
+          yum install -y git gcc gcc-c++ make cmake libcurl-devel
+          git config --global --add safe.directory "$GITHUB_WORKSPACE"
+
+      - name: Build
+        run: |
+          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+
+          cmake -S . -B build \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+              -DGGML_CANN=on \
+              -DSOC_TYPE=ascend${{ matrix.chip_type }}
+          cmake --build build -j $(nproc)
+
+      - name: Determine tag name
+        id: tag
+        uses: ./.github/actions/get-tag-name
+
+      - name: Pack artifacts
+        run: |
+          cp LICENSE ./build/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip ./build/bin/*
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
+          name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}.zip
+
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@@ -704,6 +759,7 @@ jobs:
      - macOS-arm64
      - macOS-x64
      - ios-xcode-build
+      - openEuler-cann

    steps:
      - name: Clone
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -56,7 +56,7 @@ jobs:
            curl \
            wget \
            language-pack-en \
-            libcurl4-openssl-dev
+            libssl-dev

      - name: Clone
        id: checkout
@@ -209,7 +209,7 @@ jobs:
        working-directory: tools/server/webui

      - name: Run UI tests
-        run: npm run test:ui
+        run: npm run test:ui -- --testTimeout=60000
        working-directory: tools/server/webui

      - name: Run E2E tests
@@ -242,7 +242,7 @@ jobs:
            curl \
            wget \
            language-pack-en \
-            libcurl4-openssl-dev
+            libssl-dev

      - name: Clone
        id: checkout
@@ -283,6 +283,8 @@ jobs:
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
+              -DLLAMA_CURL=OFF \
+              -DLLAMA_OPENSSL=ON \
              -DLLAMA_BUILD_SERVER=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
@@ -295,6 +297,8 @@ jobs:
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
+              -DLLAMA_CURL=OFF \
+              -DLLAMA_OPENSSL=ON \
              -DLLAMA_BUILD_SERVER=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
@@ -306,6 +310,8 @@ jobs:
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
+              -DLLAMA_CURL=OFF \
+              -DLLAMA_OPENSSL=ON \
              -DLLAMA_BUILD_SERVER=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
@@ -345,16 +351,10 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: libCURL
-        id: get_libcurl
-        uses: ./.github/actions/windows-setup-curl
-
      - name: Build
        id: cmake_build
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
-          cmake -B build -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
+          cmake -B build -DLLAMA_CURL=OFF -DLLAMA_BUILD_BORINGSSL=ON
          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server

      - name: Python setup
@@ -368,13 +368,6 @@ jobs:
        run: |
          pip install -r tools/server/tests/requirements.txt

-      - name: Copy Libcurl
-        id: prepare_libcurl
-        env:
-          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
-        run: |
-          cp $env:CURL_PATH/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
-
      - name: Tests
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
--- a/.github/workflows/update-ops-docs.yml
+++ b/.github/workflows/update-ops-docs.yml
@@ -3,10 +3,12 @@ name: Update Operations Documentation
 on:
    push:
        paths:
+            - 'docs/ops.md'
            - 'docs/ops/**'
            - 'scripts/create_ops_docs.py'
    pull_request:
        paths:
+            - 'docs/ops.md'
            - 'docs/ops/**'
            - 'scripts/create_ops_docs.py'

--- a/.gitignore
+++ b/.gitignore
@@ -20,52 +20,40 @@
 *.so
 *.swp
 *.tmp
+*.DS_Store

 # IDE / OS

-.cache/
-.ccls-cache/
-.direnv/
-.DS_Store
-.envrc
-.idea/
-.swiftpm
-.vs/
-.vscode/
-nppBackup
+/.cache/
+/.ccls-cache/
+/.direnv/
+/.envrc
+/.idea/
+/.swiftpm
+/.vs/
+/.vscode/
+/nppBackup


 # Coverage

-gcovr-report/
-lcov-report/
+/gcovr-report/
+/lcov-report/

 # Build Artifacts

-tags
-.build/
-build*
-release
-debug
-!build-info.cmake
-!build-info.cpp.in
-!build-info.sh
-!build.zig
-!docs/build.md
+/tags
+/.build/
+/build*
+/release
+/debug
 /libllama.so
 /llama-*
 /vulkan-shaders-gen
-android-ndk-*
-arm_neon.h
-cmake-build-*
-CMakeSettings.json
-compile_commands.json
-ggml-metal-embed.metal
-llama-batched-swift
 /rpc-server
-out/
-tmp/
-autogen-*.md
+/out/
+/tmp/
+/autogen-*.md

 # Deprecated

@@ -74,44 +62,38 @@ autogen-*.md

 # CI

-!.github/workflows/*.yml
+!/.github/workflows/*.yml

 # Models

-models/*
-models-mnt
-!models/.editorconfig
-!models/ggml-vocab-*.gguf*
-!models/templates
+/models/*
+/models-mnt
+!/models/.editorconfig
+!/models/ggml-vocab-*.gguf*
+!/models/templates

 # Zig
-zig-out/
-zig-cache/
-
-# Logs
-
-ppl-*.txt
-qnt-*.txt
-perf-*.txt
+/zig-out/
+/zig-cache/

 # Examples

-examples/jeopardy/results.txt
-tools/server/*.css.hpp
-tools/server/*.html.hpp
-tools/server/*.js.hpp
-tools/server/*.mjs.hpp
-tools/server/*.gz.hpp
-!build_64.sh
-!examples/*.bat
-!examples/*/*.kts
-!examples/*/*/*.kts
-!examples/sycl/*.bat
-!examples/sycl/*.sh
+/examples/jeopardy/results.txt
+/tools/server/*.css.hpp
+/tools/server/*.html.hpp
+/tools/server/*.js.hpp
+/tools/server/*.mjs.hpp
+/tools/server/*.gz.hpp
+!/build_64.sh
+!/examples/*.bat
+!/examples/*/*.kts
+!/examples/*/*/*.kts
+!/examples/sycl/*.bat
+!/examples/sycl/*.sh

 # Server Web UI temporary files
-node_modules
-tools/server/webui/dist
+/tools/server/webui/node_modules
+/tools/server/webui/dist

 # Python

@@ -147,8 +129,8 @@ poetry.toml
 # Local scripts
 /run-vim.sh
 /run-chat.sh
-.ccache/
-
-# Code Workspace
-*.code-workspace
+/.ccache/

+# IDE
+/*.code-workspace
+/.windsurf/
--- a/.windsurf/rules/css-architecture.md
+++ b/.windsurf/rules/css-architecture.md
@@ -1,7 +0,0 @@
---
-trigger: manual
---
-
-#### Tailwind & CSS
-
-   We are using Tailwind v4 which uses oklch colors so we now want to refer to the CSS vars directly, without wrapping it with any color function like `hsla/hsl`, `rgba` etc.
--- a/.windsurf/rules/sveltekit-architecture.md
+++ b/.windsurf/rules/sveltekit-architecture.md
@@ -1,48 +0,0 @@
---
-trigger: manual
---
-
-# Coding rules
-
-## Svelte & SvelteKit
-
-### Services vs Stores Separation Pattern
-
-#### `lib/services/` - Pure Business Logic
-
-   **Purpose**: Stateless business logic and external communication
-   **Contains**:
-    -   API calls to external services (ApiService)
-    -   Pure business logic functions (ChatService, etc.)
-   **Rules**:
-    -   NO Svelte runes ($state, $derived, $effect)
-    -   NO reactive state management
-    -   Pure functions and classes only
-    -   Can import types but not stores
-    -   Focus on "how" - implementation details
-
-#### `lib/stores/` - Reactive State Management
-
-   **Purpose**: Svelte-specific reactive state with runes
-   **Contains**:
-    -   Reactive state classes with $state, $derived, $effect
-    -   Database operations (DatabaseStore)
-    -   UI-focused state management
-    -   Store orchestration logic
-   **Rules**:
-    -   USE Svelte runes for reactivity
-    -   Import and use services for business logic
-    -   NO direct database operations
-    -   NO direct API calls (use services)
-    -   Focus on "what" - reactive state for UI
-
-#### Enforcement
-
-   Services should be testable without Svelte
-   Stores should leverage Svelte's reactivity system
-   Clear separation: services handle data, stores handle state
-   Services can be reused across multiple stores
-
-#### Misc
-
-   Always use `let` for $derived state variables
--- a/.windsurf/rules/tests.md
+++ b/.windsurf/rules/tests.md
@@ -1,9 +0,0 @@
---
-trigger: manual
---
-
-# Automated Tests
-
-## General rules
-
-   NEVER include any test code in the production code - we should always have it in a separate dedicated files
--- a/.windsurf/rules/typescript-architecture.md
+++ b/.windsurf/rules/typescript-architecture.md
@@ -1,7 +0,0 @@
---
-trigger: manual
---
-
-## TypeScript
-
-   Add JSDocs for functions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -92,6 +92,8 @@ option(LLAMA_TOOLS_INSTALL  "llama: install tools"        ${LLAMA_TOOLS_INSTALL_

 # 3rd party libs
 option(LLAMA_CURL       "llama: use libcurl to download model from an URL" ON)
+option(LLAMA_HTTPLIB    "llama: if libcurl is disabled, use httplib to download model from an URL" ON)
+option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" OFF)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)

 # Required for relocatable CMake package
@@ -199,6 +201,9 @@ endif()

 if (LLAMA_BUILD_COMMON)
    add_subdirectory(common)
+    if (LLAMA_HTTPLIB)
+        add_subdirectory(vendor/cpp-httplib)
+    endif()
 endif()

 if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
--- a/111
+++ b/111
@@ -1,12 +1,103 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
+# multiplie collaborators per item can be specified

-/ci/ @ggerganov
-/.devops/*.Dockerfile @ngxson
-/tools/server/ @ngxson
-/ggml/src/ggml-cuda/fattn* @JohannesGaessler
-/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
-/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
-/ggml/src/ggml-opt.cpp @JohannesGaessler
-/ggml/src/gguf.cpp @JohannesGaessler
-/ggml/src/ggml-vulkan/ @0cc4m
-/ggml/src/ggml-zdnn/ @taronaeo
+/.devops/*.Dockerfile                   @ngxson
+/.github/actions/                       @CISC
+/.github/workflows/                     @CISC
+/ci/                                    @ggerganov
+/cmake/                                 @ggerganov
+/common/CMakeLists.txt                  @ggerganov
+/common/arg.*                           @ggerganov @ericcurtin
+/common/base64.hpp.*                    @ggerganov
+/common/build-info.*                    @ggerganov
+/common/common.*                        @ggerganov
+/common/console.*                       @ggerganov
+/common/http.*                          @angt
+/common/llguidance.*                    @ggerganov
+/common/log.*                           @ggerganov
+/common/sampling.*                      @ggerganov
+/common/speculative.*                   @ggerganov
+/convert_*.py                           @CISC
+/examples/batched.swift/                @ggerganov
+/examples/batched/                      @ggerganov
+/examples/convert-llama2c-to-ggml/      @ggerganov
+/examples/deprecation-warning/          @ggerganov
+/examples/diffusion/                    @am17an
+/examples/embedding/                    @ggerganov
+/examples/eval-callback/                @ggerganov
+/examples/export-docs/                  @ggerganov
+/examples/gen-docs/                     @ggerganov
+/examples/gguf/                         @ggerganov
+/examples/llama.android/                @ggerganov
+/examples/llama.swiftui/                @ggerganov
+/examples/llama.vim                     @ggerganov
+/examples/lookahead/                    @ggerganov
+/examples/lookup/                       @JohannesGaessler
+/examples/model-conversion/             @danbev
+/examples/parallel/                     @ggerganov
+/examples/passkey/                      @ggerganov
+/examples/retrieval/                    @ggerganov
+/examples/save-load-state/              @ggerganov
+/examples/speculative-simple/           @ggerganov
+/examples/speculative/                  @ggerganov
+/ggml/cmake/                            @ggerganov
+/ggml/include/                          @ggerganov
+/ggml/src/ggml-common.h                 @ggerganov
+/ggml/src/ggml-cpu/                     @ggerganov
+/ggml/src/ggml-cpu/spacemit/            @alex-spacemit
+/ggml/src/ggml-cuda/fattn*              @JohannesGaessler
+/ggml/src/ggml-cuda/mmf.*               @JohannesGaessler @am17an
+/ggml/src/ggml-cuda/mmq.*               @JohannesGaessler
+/ggml/src/ggml-cuda/mmvf.*              @JohannesGaessler
+/ggml/src/ggml-cuda/mmvq.*              @JohannesGaessler
+/ggml/src/ggml-cuda/fattn-wmma*         @IMbackK
+/ggml/src/ggml-hip/                     @IMbackK
+/ggml/src/ggml-cuda/vendors/hip.h       @IMbackK
+/ggml/src/ggml-impl.h                   @ggerganov
+/ggml/src/ggml-metal/                   @ggerganov
+/ggml/src/ggml-opencl/                  @lhez @max-krasnyansky
+/ggml/src/ggml-hexagon/                 @max-krasnyansky @lhez
+/ggml/src/ggml-opt.cpp                  @JohannesGaessler
+/ggml/src/ggml-quants.*                 @ggerganov
+/ggml/src/ggml-rpc/                     @rgerganov
+/ggml/src/ggml-threading.*              @ggerganov
+/ggml/src/ggml-vulkan/                  @0cc4m
+/ggml/src/ggml-webgpu/                  @reeselevine
+/ggml/src/ggml-zdnn/                    @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
+/ggml/src/ggml.c                        @ggerganov
+/ggml/src/ggml.cpp                      @ggerganov
+/ggml/src/gguf.cpp                      @JohannesGaessler @Green-Sky
+/gguf-py/                               @CISC
+/media/                                 @ggerganov
+/scripts/gen*                           @ggerganov
+/scripts/get*                           @ggerganov
+/scripts/sync*                          @ggerganov
+/src/                                   @ggerganov
+/src/llama-adapter.*                    @CISC
+/src/llama-arch.*                       @CISC
+/src/llama-chat.*                       @ngxson
+/src/llama-graph.*                      @CISC
+/src/llama-model.*                      @CISC
+/src/llama-vocab.*                      @CISC
+/src/models/                            @CISC
+/tests/                                 @ggerganov
+/tools/batched-bench/                   @ggerganov
+/tools/main/                            @ggerganov
+/tools/mtmd/                            @ngxson
+/tools/perplexity/                      @ggerganov
+/tools/quantize/                        @ggerganov
+/tools/rpc/                             @rgerganov
+/tools/run/                             @ericcurtin
+/tools/server/*                         @ngxson @ggerganov @ericcurtin # no subdir
+/tools/server/webui/                    @allozaur
+/tools/tokenize/                        @ggerganov
+/tools/tts/                             @ggerganov
+/vendor/                                @ggerganov
+/AUTHORS                                @ggerganov
+/CMakeLists.txt                         @ggerganov
+/CONTRIBUTING.md                        @ggerganov
+/LICENSE                                @ggerganov
+/README.md                              @ggerganov
+/SECURITY.md                            @ggerganov
+/build-xcframework.sh                   @danbev
+requirements*.txt                       @CISC
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,4 +1,12 @@
-# Pull requests (for contributors)
+# Contributors
+
+The project differentiates between 3 levels of contributors:
+
+- Contributors: people who have contributed before (no special privileges)
+- Collaborators (Triage): people with significant contributions, who may be responsible for some parts of the code, and are expected to maintain and review contributions for the code they own
+- Maintainers: responsible for reviewing and merging PRs, after approval from the code owners
+
+# Pull requests (for contributors & collaborators)

 - llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
 - Test your changes:
@@ -9,15 +17,16 @@
 - Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments
+- Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
+- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs

-# Pull requests (for collaborators)
+# Pull requests (for maintainers)

 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
 - Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
- Let authors, who are also collaborators, merge their own PRs
- When merging a PR by a contributor, make sure you have a good understanding of the changes
+- Let other maintainers merge their own PRs
+- When merging a PR, make sure you have a good understanding of the changes
 - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)

 # Coding guidelines
@@ -117,6 +126,21 @@
    #endif // FOO
    ```

+# Code maintenance
+
+- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file reponsible for:
+  - Reviewing and merging related PRs
+  - Fixing related bugs
+  - Providing developer guidance/support
+
+- When adding or modifying a large piece of code:
+  - If you are a collaborator, make sure to add yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
+  - If you are a contributor, find an existing collaborator who is willing to review and maintain your code long-term
+  - Provide the necessary CI workflow (and hardware) to test your changes (see [ci/README.md](https://github.com/ggml-org/llama.cpp/tree/master/ci))
+
+- New code should follow the guidelines (coding, naming, etc.) outlined in this document. Exceptions are allowed in isolated, backend-specific parts of the code that do not interface directly with the `ggml` interfaces.
+  _(NOTE: for legacy reasons, existing code is not required to follow this guideline)_
+
 # Documentation

 - Documentation is a community effort
--- a/README.md
+++ b/README.md
@@ -17,14 +17,13 @@ LLM inference in C/C++

 ## Hot topics

- **[guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)**
- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)**
+- **[guide : using the new WebUI of llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/16938)**
+- [guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)
+- [[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)
 - Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
 - Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
 - Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

@@ -62,6 +61,7 @@ range of hardware - locally and in the cloud.
 - Plain C/C++ implementation without any dependencies
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
+- RVV, ZVFH, ZFH and ZICBOP support for RISC-V architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
 - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)
 - Vulkan and SYCL backend support
@@ -84,6 +84,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
 - [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
+- [x] [Jamba](https://huggingface.co/ai21labs)
 - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
@@ -138,6 +139,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
 - [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
 - [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
+- [x] [BailingMoeV2 (Ring/Ling 2.0) models](https://huggingface.co/collections/inclusionAI/ling-v2-68bf1dd2fc34c306c1fa6f86)

 #### Multimodal

@@ -178,6 +180,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
+- Java: [QuasarByte/llama-cpp-jna](https://github.com/QuasarByte/llama-cpp-jna)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 - Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
@@ -186,6 +189,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
 - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
 - Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi)
+- Go (no CGo needed): [hybridgroup/yzma](https://github.com/hybridgroup/yzma)

 </details>

@@ -238,6 +242,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
 - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
 - [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example)
+- [unslothai/unsloth](https://github.com/unslothai/unsloth) – 🦥 exports/saves fine-tuned and trained models to GGUF (Apache-2.0)

 </details>

@@ -274,8 +279,10 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
+| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
 | [WebGPU [In Progress]](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
+| [Hexagon [In Progress]](docs/backend/hexagon/README.md) | Snapdragon |

 ## Obtaining and quantizing models

@@ -520,8 +527,8 @@ To learn more about model quantization, [read this documentation](tools/quantize
 ## Contributing

 - Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
+- Maintainers can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Any help with managing issues, PRs and projects is very appreciated!
 - See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
--- a/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.html
+++ b/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.html
--- a/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.json
+++ b/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547.json
@@ -0,0 +1,6 @@
+{
+  "chars": 2296.1916666666666,
+  "chars:std": 986.051306946325,
+  "score": 0.925,
+  "score:std": 0.26339134382131846
+}
--- a/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547_allresults.json
+++ b/benches/dgx-spark/aime25_openai__gpt-oss-120b-high_temp1.0_20251109_094547_allresults.json
--- a/benches/dgx-spark/dgx-spark.md
+++ b/benches/dgx-spark/dgx-spark.md
@@ -0,0 +1,264 @@
+## System info
+
+```bash
+uname --all
+Linux spark-17ed 6.11.0-1016-nvidia #16-Ubuntu SMP PREEMPT_DYNAMIC Sun Sep 21 16:52:46 UTC 2025 aarch64 aarch64 aarch64 GNU/Linux
+
+g++ --version
+g++ (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0
+
+nvidia-smi
+Sun Nov  2 10:43:25 2025
+-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 580.95.05              Driver Version: 580.95.05      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA GB10                    On  |   0000000F:01:00.0 Off |                  N/A |
+| N/A   35C    P8              4W /  N/A  | Not Supported          |      0%      Default |
+|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
+```
+
+## ggml-org/gpt-oss-20b-GGUF
+
+Model: https://huggingface.co/ggml-org/gpt-oss-20b-GGUF
+
+- `llama-batched-bench`
+
+
+main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
+
+|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
+|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
+|   512 |     32 |    1 |    544 |    0.374 |  1369.01 |    0.383 |    83.64 |    0.757 |   719.01 |
+|   512 |     32 |    2 |   1088 |    0.274 |  3741.35 |    0.659 |    97.14 |    0.933 |  1166.66 |
+|   512 |     32 |    4 |   2176 |    0.526 |  3896.47 |    0.817 |   156.73 |    1.342 |  1621.08 |
+|   512 |     32 |    8 |   4352 |    1.044 |  3925.10 |    0.987 |   259.44 |    2.030 |  2143.56 |
+|   512 |     32 |   16 |   8704 |    2.076 |  3945.84 |    1.248 |   410.32 |    3.324 |  2618.60 |
+|   512 |     32 |   32 |  17408 |    4.170 |  3929.28 |    1.630 |   628.40 |    5.799 |  3001.76 |
+|  4096 |     32 |    1 |   4128 |    1.083 |  3782.66 |    0.394 |    81.21 |    1.477 |  2795.13 |
+|  4096 |     32 |    2 |   8256 |    2.166 |  3782.72 |    0.725 |    88.28 |    2.891 |  2856.14 |
+|  4096 |     32 |    4 |  16512 |    4.333 |  3780.88 |    0.896 |   142.82 |    5.230 |  3157.38 |
+|  4096 |     32 |    8 |  33024 |    8.618 |  3802.14 |    1.155 |   221.69 |    9.773 |  3379.08 |
+|  4096 |     32 |   16 |  66048 |   17.330 |  3781.73 |    1.598 |   320.34 |   18.928 |  3489.45 |
+|  4096 |     32 |   32 | 132096 |   34.671 |  3780.48 |    2.336 |   438.35 |   37.007 |  3569.51 |
+|  8192 |     32 |    1 |   8224 |    2.233 |  3668.56 |    0.438 |    72.98 |    2.671 |  3078.44 |
+|  8192 |     32 |    2 |  16448 |    4.425 |  3702.95 |    0.756 |    84.66 |    5.181 |  3174.95 |
+|  8192 |     32 |    4 |  32896 |    8.859 |  3698.64 |    0.967 |   132.38 |    9.826 |  3347.72 |
+|  8192 |     32 |    8 |  65792 |   17.714 |  3699.57 |    1.277 |   200.52 |   18.991 |  3464.35 |
+|  8192 |     32 |   16 | 131584 |   35.494 |  3692.84 |    1.841 |   278.12 |   37.335 |  3524.46 |
+|  8192 |     32 |   32 | 263168 |   70.949 |  3694.82 |    2.798 |   365.99 |   73.747 |  3568.53 |
+
+
+- `llama-bench`
+
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |          pp2048 |      3714.25 ± 20.36 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |            tg32 |         86.58 ± 0.43 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d4096 |      3445.17 ± 17.85 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d4096 |         81.72 ± 0.53 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d8192 |      3218.78 ± 11.34 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d8192 |         74.86 ± 0.64 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d16384 |       2732.83 ± 7.17 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d16384 |         71.57 ± 0.51 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |      2119.75 ± 12.81 |
+| gpt-oss 20B MXFP4 MoE          |  11.27 GiB |    20.91 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |         62.33 ± 0.24 |
+
+build: eeee367de (6989)
+
+## ggml-org/gpt-oss-120b-GGUF
+
+Model: https://huggingface.co/ggml-org/gpt-oss-120b-GGUF
+
+- `llama-batched-bench`
+
+
+main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
+
+|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
+|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
+|   512 |     32 |    1 |    544 |    0.571 |   897.18 |    0.543 |    58.96 |    1.113 |   488.60 |
+|   512 |     32 |    2 |   1088 |    0.593 |  1725.37 |    1.041 |    61.45 |    1.635 |   665.48 |
+|   512 |     32 |    4 |   2176 |    1.043 |  1963.15 |    1.334 |    95.95 |    2.377 |   915.36 |
+|   512 |     32 |    8 |   4352 |    2.099 |  1951.63 |    1.717 |   149.07 |    3.816 |  1140.45 |
+|   512 |     32 |   16 |   8704 |    4.207 |  1947.12 |    2.311 |   221.56 |    6.518 |  1335.35 |
+|   512 |     32 |   32 |  17408 |    8.422 |  1945.36 |    3.298 |   310.46 |   11.720 |  1485.27 |
+|  4096 |     32 |    1 |   4128 |    2.138 |  1915.88 |    0.571 |    56.09 |    2.708 |  1524.12 |
+|  4096 |     32 |    2 |   8256 |    4.266 |  1920.25 |    1.137 |    56.27 |    5.404 |  1527.90 |
+|  4096 |     32 |    4 |  16512 |    8.564 |  1913.02 |    1.471 |    86.99 |   10.036 |  1645.29 |
+|  4096 |     32 |    8 |  33024 |   17.092 |  1917.19 |    1.979 |   129.33 |   19.071 |  1731.63 |
+|  4096 |     32 |   16 |  66048 |   34.211 |  1915.65 |    2.850 |   179.66 |   37.061 |  1782.15 |
+|  4096 |     32 |   32 | 132096 |   68.394 |  1916.44 |    4.381 |   233.72 |   72.775 |  1815.13 |
+|  8192 |     32 |    1 |   8224 |    4.349 |  1883.45 |    0.620 |    51.65 |    4.969 |  1655.04 |
+|  8192 |     32 |    2 |  16448 |    8.674 |  1888.83 |    1.178 |    54.33 |    9.852 |  1669.48 |
+|  8192 |     32 |    4 |  32896 |   17.351 |  1888.55 |    1.580 |    81.01 |   18.931 |  1737.68 |
+|  8192 |     32 |    8 |  65792 |   34.743 |  1886.31 |    2.173 |   117.80 |   36.916 |  1782.20 |
+|  8192 |     32 |   16 | 131584 |   69.413 |  1888.29 |    3.297 |   155.28 |   72.710 |  1809.70 |
+|  8192 |     32 |   32 | 263168 |  138.903 |  1887.24 |    5.004 |   204.63 |  143.907 |  1828.73 |
+
+
+- `llama-bench`
+
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |          pp2048 |       1919.36 ± 5.01 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |            tg32 |         60.40 ± 0.30 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d4096 |       1825.30 ± 6.37 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d4096 |         56.94 ± 0.29 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d8192 |       1739.19 ± 6.00 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d8192 |         52.51 ± 0.42 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d16384 |       1536.75 ± 4.27 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d16384 |         49.33 ± 0.27 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |       1255.85 ± 3.26 |
+| gpt-oss 120B MXFP4 MoE         |  59.02 GiB |   116.83 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |         42.99 ± 0.18 |
+
+build: eeee367de (6989)
+
+## ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
+
+Model: https://huggingface.co/ggml-org/Qwen3-Coder-30B-A3B-Instruct-Q8_0-GGUF
+
+- `llama-batched-bench`
+
+
+main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
+
+|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
+|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
+|   512 |     32 |    1 |    544 |    0.398 |  1285.90 |    0.530 |    60.41 |    0.928 |   586.27 |
+|   512 |     32 |    2 |   1088 |    0.386 |  2651.65 |    0.948 |    67.50 |    1.334 |   815.38 |
+|   512 |     32 |    4 |   2176 |    0.666 |  3076.37 |    1.209 |   105.87 |    1.875 |  1160.71 |
+|   512 |     32 |    8 |   4352 |    1.325 |  3091.39 |    1.610 |   158.98 |    2.935 |  1482.65 |
+|   512 |     32 |   16 |   8704 |    2.664 |  3075.58 |    2.150 |   238.19 |    4.813 |  1808.39 |
+|   512 |     32 |   32 |  17408 |    5.336 |  3070.31 |    2.904 |   352.59 |    8.240 |  2112.50 |
+|  4096 |     32 |    1 |   4128 |    1.444 |  2836.81 |    0.581 |    55.09 |    2.025 |  2038.81 |
+|  4096 |     32 |    2 |   8256 |    2.872 |  2852.14 |    1.084 |    59.06 |    3.956 |  2086.99 |
+|  4096 |     32 |    4 |  16512 |    5.744 |  2852.32 |    1.440 |    88.90 |    7.184 |  2298.47 |
+|  4096 |     32 |    8 |  33024 |   11.463 |  2858.68 |    2.068 |   123.78 |   13.531 |  2440.65 |
+|  4096 |     32 |   16 |  66048 |   22.915 |  2859.95 |    3.018 |   169.67 |   25.933 |  2546.90 |
+|  4096 |     32 |   32 | 132096 |   45.956 |  2852.10 |    4.609 |   222.18 |   50.565 |  2612.39 |
+|  8192 |     32 |    1 |   8224 |    3.063 |  2674.72 |    0.693 |    46.20 |    3.755 |  2189.92 |
+|  8192 |     32 |    2 |  16448 |    6.109 |  2681.87 |    1.214 |    52.71 |    7.323 |  2245.98 |
+|  8192 |     32 |    4 |  32896 |   12.197 |  2686.63 |    1.682 |    76.11 |   13.878 |  2370.30 |
+|  8192 |     32 |    8 |  65792 |   24.409 |  2684.94 |    2.556 |   100.17 |   26.965 |  2439.95 |
+|  8192 |     32 |   16 | 131584 |   48.753 |  2688.50 |    3.994 |   128.20 |   52.747 |  2494.64 |
+|  8192 |     32 |   32 | 263168 |   97.508 |  2688.42 |    6.528 |   156.86 |  104.037 |  2529.57 |
+
+
+- `llama-bench`
+
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |          pp2048 |       2925.55 ± 4.25 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |            tg32 |         62.80 ± 0.27 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d4096 |       2531.01 ± 6.79 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d4096 |         55.86 ± 0.33 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d8192 |       2244.39 ± 5.33 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d8192 |         45.95 ± 0.33 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d16384 |       1783.17 ± 3.68 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d16384 |         39.07 ± 0.10 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |       1241.90 ± 3.13 |
+| qwen3moe 30B.A3B Q8_0          |  30.25 GiB |    30.53 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |         29.92 ± 0.06 |
+
+build: eeee367de (6989)
+
+## ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
+
+Model: https://huggingface.co/ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF
+
+- `llama-batched-bench`
+
+
+main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
+
+|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
+|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
+|   512 |     32 |    1 |    544 |    0.211 |  2421.57 |    1.055 |    30.33 |    1.266 |   429.57 |
+|   512 |     32 |    2 |   1088 |    0.419 |  2441.34 |    1.130 |    56.65 |    1.549 |   702.32 |
+|   512 |     32 |    4 |   2176 |    0.873 |  2345.54 |    1.174 |   108.99 |    2.048 |  1062.74 |
+|   512 |     32 |    8 |   4352 |    1.727 |  2371.85 |    1.254 |   204.22 |    2.980 |  1460.19 |
+|   512 |     32 |   16 |   8704 |    3.452 |  2373.22 |    1.492 |   343.16 |    4.944 |  1760.56 |
+|   512 |     32 |   32 |  17408 |    6.916 |  2368.93 |    1.675 |   611.51 |    8.591 |  2026.36 |
+|  4096 |     32 |    1 |   4128 |    1.799 |  2277.26 |    1.084 |    29.51 |    2.883 |  1431.91 |
+|  4096 |     32 |    2 |   8256 |    3.577 |  2290.01 |    1.196 |    53.50 |    4.774 |  1729.51 |
+|  4096 |     32 |    4 |  16512 |    7.172 |  2284.36 |    1.313 |    97.50 |    8.485 |  1946.00 |
+|  4096 |     32 |    8 |  33024 |   14.341 |  2284.96 |    1.520 |   168.46 |   15.860 |  2082.18 |
+|  4096 |     32 |   16 |  66048 |   28.675 |  2285.44 |    1.983 |   258.21 |   30.658 |  2154.33 |
+|  4096 |     32 |   32 | 132096 |   57.354 |  2285.32 |    2.640 |   387.87 |   59.994 |  2201.82 |
+|  8192 |     32 |    1 |   8224 |    3.701 |  2213.75 |    1.119 |    28.59 |    4.820 |  1706.34 |
+|  8192 |     32 |    2 |  16448 |    7.410 |  2211.19 |    1.272 |    50.31 |    8.682 |  1894.56 |
+|  8192 |     32 |    4 |  32896 |   14.802 |  2213.83 |    1.460 |    87.68 |   16.261 |  2022.96 |
+|  8192 |     32 |    8 |  65792 |   29.609 |  2213.35 |    1.781 |   143.74 |   31.390 |  2095.93 |
+|  8192 |     32 |   16 | 131584 |   59.229 |  2212.96 |    2.495 |   205.17 |   61.725 |  2131.79 |
+|  8192 |     32 |   32 | 263168 |  118.449 |  2213.15 |    3.714 |   275.75 |  122.162 |  2154.25 |
+
+
+- `llama-bench`
+
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |          pp2048 |       2272.74 ± 4.68 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |            tg32 |         30.66 ± 0.02 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d4096 |       2107.80 ± 9.55 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d4096 |         29.71 ± 0.05 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d8192 |       1937.80 ± 6.75 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d8192 |         28.86 ± 0.04 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d16384 |       1641.12 ± 1.78 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d16384 |         27.24 ± 0.04 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |       1296.02 ± 2.67 |
+| qwen2 7B Q8_0                  |   7.54 GiB |     7.62 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |         23.78 ± 0.03 |
+
+build: eeee367de (6989)
+
+## ggml-org/gemma-3-4b-it-qat-GGUF
+
+Model: https://huggingface.co/ggml-org/gemma-3-4b-it-qat-GGUF
+
+- `llama-batched-bench`
+
+
+main: n_kv_max = 270336, n_batch = 2048, n_ubatch = 2048, flash_attn = 1, is_pp_shared = 0, n_gpu_layers = -1, n_threads = 20, n_threads_batch = 20
+
+|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
+|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
+|   512 |     32 |    1 |    544 |    0.094 |  5434.73 |    0.394 |    81.21 |    0.488 |  1114.15 |
+|   512 |     32 |    2 |   1088 |    0.168 |  6091.68 |    0.498 |   128.52 |    0.666 |  1633.41 |
+|   512 |     32 |    4 |   2176 |    0.341 |  6010.68 |    0.542 |   236.37 |    0.882 |  2466.43 |
+|   512 |     32 |    8 |   4352 |    0.665 |  6161.46 |    0.678 |   377.74 |    1.342 |  3241.72 |
+|   512 |     32 |   16 |   8704 |    1.323 |  6193.19 |    0.902 |   567.41 |    2.225 |  3911.74 |
+|   512 |     32 |   32 |  17408 |    2.642 |  6202.03 |    1.231 |   832.03 |    3.872 |  4495.36 |
+|  4096 |     32 |    1 |   4128 |    0.701 |  5840.49 |    0.439 |    72.95 |    1.140 |  3621.23 |
+|  4096 |     32 |    2 |   8256 |    1.387 |  5906.82 |    0.574 |   111.48 |    1.961 |  4210.12 |
+|  4096 |     32 |    4 |  16512 |    2.758 |  5940.33 |    0.651 |   196.58 |    3.409 |  4843.33 |
+|  4096 |     32 |    8 |  33024 |    5.491 |  5967.56 |    0.876 |   292.40 |    6.367 |  5187.12 |
+|  4096 |     32 |   16 |  66048 |   10.978 |  5969.58 |    1.275 |   401.69 |   12.253 |  5390.38 |
+|  4096 |     32 |   32 | 132096 |   21.944 |  5972.93 |    1.992 |   514.16 |   23.936 |  5518.73 |
+|  8192 |     32 |    1 |   8224 |    1.402 |  5841.91 |    0.452 |    70.73 |    1.855 |  4434.12 |
+|  8192 |     32 |    2 |  16448 |    2.793 |  5865.34 |    0.637 |   100.55 |    3.430 |  4795.51 |
+|  8192 |     32 |    4 |  32896 |    5.564 |  5889.64 |    0.770 |   166.26 |    6.334 |  5193.95 |
+|  8192 |     32 |    8 |  65792 |   11.114 |  5896.44 |    1.122 |   228.07 |   12.237 |  5376.51 |
+|  8192 |     32 |   16 | 131584 |   22.210 |  5901.38 |    1.789 |   286.15 |   24.000 |  5482.74 |
+|  8192 |     32 |   32 | 263168 |   44.382 |  5906.56 |    3.044 |   336.38 |   47.426 |  5549.02 |
+
+
+- `llama-bench`
+
+| model                          |       size |     params | backend    | ngl | n_ubatch | fa | mmap |            test |                  t/s |
+| ------------------------------ | ---------: | ---------: | ---------- | --: | -------: | -: | ---: | --------------: | -------------------: |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |          pp2048 |      5810.04 ± 21.71 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |            tg32 |         84.54 ± 0.18 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d4096 |       5288.04 ± 3.54 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d4096 |         78.82 ± 1.37 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |  pp2048 @ d8192 |      4960.43 ± 16.64 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |    tg32 @ d8192 |         74.13 ± 0.30 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d16384 |      4495.92 ± 31.11 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d16384 |         72.37 ± 0.29 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 | pp2048 @ d32768 |      3746.90 ± 40.01 |
+| gemma3 4B Q4_0                 |   2.35 GiB |     3.88 B | CUDA       |  99 |     2048 |  1 |    0 |   tg32 @ d32768 |         63.02 ± 0.20 |
+
+build: eeee367de (6989)
+
--- a/benches/dgx-spark/run-aime-120b-t8-x8-high.log
+++ b/benches/dgx-spark/run-aime-120b-t8-x8-high.log
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -422,6 +422,7 @@ echo "Building for iOS devices..."
 cmake -B build-ios-device -G Xcode \
    "${COMMON_CMAKE_ARGS[@]}" \
    -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
+    -DCMAKE_SYSTEM_NAME=iOS \
    -DCMAKE_OSX_SYSROOT=iphoneos \
    -DCMAKE_OSX_ARCHITECTURES="arm64" \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
@@ -453,6 +454,8 @@ cmake -B build-visionos -G Xcode \
    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
    -DLLAMA_CURL=OFF \
+    -DLLAMA_HTTPLIB=OFF \
+    -DLLAMA_BUILD_SERVER=OFF \
    -S .
 cmake --build build-visionos --config Release -- -quiet

@@ -467,6 +470,8 @@ cmake -B build-visionos-sim -G Xcode \
    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
    -DLLAMA_CURL=OFF \
+    -DLLAMA_HTTPLIB=OFF \
+    -DLLAMA_BUILD_SERVER=OFF \
    -S .
 cmake --build build-visionos-sim --config Release -- -quiet

--- a/ci/README-MUSA.md
+++ b/ci/README-MUSA.md
@@ -0,0 +1,35 @@
+## Running MUSA CI in a Docker Container
+
+Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
+
+### 1. Create a local directory to store cached models, configuration files and venv:
+
+```bash
+mkdir -p $HOME/llama.cpp/ci-cache
+```
+
+### 2. Create a local directory to store CI run results:
+
+```bash
+mkdir -p $HOME/llama.cpp/ci-results
+```
+
+### 3. Start a Docker container and run the CI:
+
+```bash
+docker run --privileged -it \
+    -v $HOME/llama.cpp/ci-cache:/ci-cache \
+    -v $HOME/llama.cpp/ci-results:/ci-results \
+    -v $PWD:/ws -w /ws \
+    mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
+```
+
+Inside the container, execute the following commands:
+
+```bash
+apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
+git config --global --add safe.directory /ws
+GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
+```
+
+This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.
--- a/ci/README.md
+++ b/ci/README.md
@@ -1,18 +1,10 @@
 # CI

-In addition to [Github Actions](https://github.com/ggml-org/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
+This CI implements heavy-duty workflows that run on self-hosted runners. Typically the purpose of these workflows is to
+cover hardware configurations that are not available from Github-hosted runners and/or require more computational
+resource than normally available.

-https://github.com/ggml-org/ci
-
-It monitors the `master` branch for new commits and runs the
-[ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
-to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
-to cover various hardware architectures, including GPU and Apple Silicon instances.
-
-Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
-Only the branches of this repo are monitored for this keyword.
-
-It is a good practice, before publishing changes to execute the full CI locally on your machine:
+It is a good practice, before publishing changes to execute the full CI locally on your machine. For example:

 ```bash
 mkdir tmp
@@ -29,40 +21,13 @@ GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

 # with MUSA support
 GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# etc.
 ```

-## Running MUSA CI in a Docker Container
+# Adding self-hosted runners

-Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
-
-### 1. Create a local directory to store cached models, configuration files and venv:
-
-```bash
-mkdir -p $HOME/llama.cpp/ci-cache
-```
-
-### 2. Create a local directory to store CI run results:
-
-```bash
-mkdir -p $HOME/llama.cpp/ci-results
-```
-
-### 3. Start a Docker container and run the CI:
-
-```bash
-docker run --privileged -it \
-    -v $HOME/llama.cpp/ci-cache:/ci-cache \
-    -v $HOME/llama.cpp/ci-results:/ci-results \
-    -v $PWD:/ws -w /ws \
-    mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
-```
-
-Inside the container, execute the following commands:
-
-```bash
-apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
-git config --global --add safe.directory /ws
-GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
-```
-
-This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.
+- Add a self-hosted `ggml-ci` workflow to [[.github/workflows/build.yml]] with an appropriate label
+- Request a runner token from `ggml-org` (for example, via a comment in the PR or email)
+- Set-up a machine using the received token ([docs](https://docs.github.com/en/actions/how-tos/manage-runners/self-hosted-runners/add-runners))
+- Optionally update [ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) to build and run on the target platform by gating the implementation with a `GG_BUILD_...` env
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -22,6 +22,9 @@
 # # with MUSA support
 # GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with KLEIDIAI support
+# GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#

 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@@ -34,15 +37,15 @@ mkdir -p "$2"
 OUT=$(realpath "$1")
 MNT=$(realpath "$2")

-rm -f "$OUT/*.log"
-rm -f "$OUT/*.exit"
-rm -f "$OUT/*.md"
+rm -f $OUT/*.log
+rm -f $OUT/*.exit
+rm -f $OUT/*.md

 sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
+CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_SCHED_NO_REALLOC=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
@@ -65,6 +68,16 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
    fi
 fi

+if [ ! -z ${GG_BUILD_ROCM} ]; then
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_HIP=ON"
+    if [ -z ${GG_BUILD_AMDGPU_TARGETS} ]; then
+        echo "Missing GG_BUILD_AMDGPU_TARGETS, please set it to your GPU architecture (e.g. gfx90a, gfx1100, etc.)"
+        exit 1
+    fi
+
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
+fi
+
 if [ ! -z ${GG_BUILD_SYCL} ]; then
    if [ -z ${ONEAPI_ROOT} ]; then
        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
@@ -82,6 +95,12 @@ fi

 if [ ! -z ${GG_BUILD_VULKAN} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
+
+    # if on Mac, disable METAL
+    if [[ "$OSTYPE" == "darwin"* ]]; then
+        CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
+    fi
+
 fi

 if [ ! -z ${GG_BUILD_WEBGPU} ]; then
@@ -93,6 +112,45 @@ if [ ! -z ${GG_BUILD_MUSA} ]; then
    MUSA_ARCH=${MUSA_ARCH:-21}
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
 fi
+
+if [ ! -z ${GG_BUILD_NO_SVE} ]; then
+    # arm 9 and newer enables sve by default, adjust these flags depending on the cpu used
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
+fi
+
+if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
+    echo ">>===== Enabling KleidiAI support"
+
+    CANDIDATES=(
+        "armv9-a+dotprod+i8mm+sve2"
+        "armv9-a+dotprod+i8mm"
+        "armv8.6-a+dotprod+i8mm"
+        "armv8.2-a+dotprod"
+    )
+    CPU=""
+
+    for cpu in "${CANDIDATES[@]}"; do
+        if echo 'int main(){}' | ${CXX:-c++} -march="$cpu" -x c++ - -c -o /dev/null >/dev/null 2>&1; then
+            CPU="$cpu"
+            break
+        fi
+    done
+
+    if [ -z "$CPU" ]; then
+        echo "ERROR: None of the required ARM baselines (armv9/armv8.6/armv8.2 + dotprod) are supported by this compiler."
+        exit 1
+    fi
+
+    echo ">>===== Using ARM baseline: ${CPU}"
+
+    CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } \
+        -DGGML_NATIVE=OFF \
+        -DGGML_CPU_KLEIDIAI=ON \
+        -DGGML_CPU_AARCH64=ON \
+        -DGGML_CPU_ARM_ARCH=${CPU} \
+        -DBUILD_SHARED_LIBS=OFF"
+fi
+
 ## helpers

 # download a file if it does not exist or if it is outdated
@@ -150,7 +208,7 @@ function gg_run_ctest_debug {
    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                  ) 2>&1 | tee -a $OUT/${ci}-make.log

-    (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
 }
@@ -200,33 +258,9 @@ function gg_sum_ctest_release {
    gg_printf '```\n'
 }

-# test_scripts_debug
+# test_scripts

-function gg_run_test_scripts_debug {
-    cd ${SRC}
-
-    set -e
-
-    (cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    (cd ./tools/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-
-    set +e
-}
-
-function gg_sum_test_scripts_debug {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs test scripts in debug mode\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
-    gg_printf '```\n'
-    gg_printf '\n'
-}
-
-# test_scripts_release
-
-function gg_run_test_scripts_release {
+function gg_run_test_scripts {
    cd ${SRC}

    set -e
@@ -237,10 +271,10 @@ function gg_run_test_scripts_release {
    set +e
 }

-function gg_sum_test_scripts_release {
+function gg_sum_test_scripts {
    gg_printf '### %s\n\n' "${ci}"

-    gg_printf 'Runs test scripts in release mode\n'
+    gg_printf 'Runs test scripts\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
@@ -249,15 +283,9 @@ function gg_sum_test_scripts_release {
 }

 function gg_get_model {
-    local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
-    local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
-    local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
+    local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
    if [[ -s $gguf_0 ]]; then
        echo -n "$gguf_0"
-    elif [[ -s $gguf_1 ]]; then
-        echo -n "$gguf_1"
-    elif [[ -s $gguf_2 ]]; then
-        echo -n "$gguf_2"
    else
        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
        exit 1
@@ -316,24 +344,22 @@ function gg_sum_ctest_with_model_release {
    gg_printf '```\n'
 }

-# open_llama_7b_v2
+# qwen3_0_6b

-function gg_run_open_llama_7b_v2 {
+function gg_run_qwen3_0_6b {
    cd ${SRC}

-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
+    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/config.json
+    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/tokenizer.json
+    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/tokenizer_config.json
+   #gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/special_tokens_map.json
+    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/resolve/main/model.safetensors
+

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/

-    path_models="../models-mnt/open-llama/7B-v2"
+    path_models="../models-mnt/qwen3/0.6B"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
@@ -343,9 +369,11 @@ function gg_run_open_llama_7b_v2 {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf  --outtype f16
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16

    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_bf16="${path_models}/ggml-model-bf16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
@@ -359,179 +387,51 @@ function gg_run_open_llama_7b_v2 {

    wiki_test="${path_wiki}/wiki.test.raw"

-    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
+    ./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)

-    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    if [ -z ${GG_BUILD_NO_BF16} ]; then
+        (time ./bin/llama-perplexity --model ${model_bf16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
+    fi
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-
-    function check_ppl {
-        qnt="$1"
-        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
-            return 20
-        fi
-
-        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
-        return 0
-    }
-
-    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-
-    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
-
-    set +e
-}
-
-function gg_sum_open_llama_7b_v2 {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'OpenLLaMA 7B-v2:\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
-    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
-    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
-    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
-    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
-    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
-    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
-    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
-    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
-    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
-    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-}
-
-# pythia_1.4b
-
-function gg_run_pythia_1_4b {
-    cd ${SRC}
-
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
-    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
-
-    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
-    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
-
-    path_models="../models-mnt/pythia/1.4B"
-    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
-
-    model_f16="${path_models}/ggml-model-f16.gguf"
-    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
-    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
-    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
-    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
-    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
-    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
-    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
-    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
-    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
-    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
-
-    wiki_test_60="${path_wiki}/wiki.test-60.raw"
-
-    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
-
-    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on  --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off                ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on                 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -547,6 +447,9 @@ function gg_run_pythia_1_4b {
    }

    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    if [ -z ${GG_BUILD_NO_BF16} ]; then
+        check_ppl "bf16" "$(cat $OUT/${ci}-tg-bf16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    fi
    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
@@ -563,147 +466,17 @@ function gg_run_pythia_1_4b {
    set +e
 }

-function gg_sum_pythia_1_4b {
+function gg_sum_qwen3_0_6b {
    gg_printf '### %s\n\n' "${ci}"

-    gg_printf 'Pythia 1.4B:\n'
+    gg_printf 'Qwen3 0.6B:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
-    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
-    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
-    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
-    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
-    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
-    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
-    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
-    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
-    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-}
-
-# pythia_2_8b
-
-function gg_run_pythia_2_8b {
-    cd ${SRC}
-
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
-    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
-
-    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
-    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
-
-    path_models="../models-mnt/pythia/2.8B"
-    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
-
-    model_f16="${path_models}/ggml-model-f16.gguf"
-    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
-    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
-    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
-    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
-    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
-    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
-    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
-    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
-    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
-    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
-
-    wiki_test="${path_wiki}/wiki.test.raw"
-
-    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
-
-    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-
-    function check_ppl {
-        qnt="$1"
-        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
-            return 20
-        fi
-
-        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
-        return 0
-    }
-
-    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-   #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
-    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-
-    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
-
-    set +e
-}
-
-function gg_sum_pythia_2_8b {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Pythia 2.8B:\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
-    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- f16:\n```\n%s\n```\n'  "$(cat $OUT/${ci}-tg-f16.log)"
+    if [ -z ${GG_BUILD_NO_BF16} ]; then
+        gg_printf '- bf16:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-bf16.log)"
+    fi
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
@@ -750,8 +523,8 @@ function gg_run_embd_bge_small {

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

-    (time ./bin/llama-embedding --model ${model_f16}  -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-embedding --model ${model_f16}  -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 --no-op-offload) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

    set +e
 }
@@ -775,12 +548,7 @@ function gg_run_rerank_tiny {
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
-
-    gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.json

    path_models="../models-mnt/rerank-tiny"

@@ -796,7 +564,7 @@ function gg_run_rerank_tiny {
    model_f16="${path_models}/ggml-model-f16.gguf"

    # for this model, the SEP token is "</s>"
-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --no-op-offload --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

    # sample output
    # rerank score 0:    0.029
@@ -870,6 +638,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
 fi

 ret=0
+
 test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release

@@ -878,20 +647,15 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run rerank_tiny

    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
-        test $ret -eq 0 && gg_run test_scripts_debug
-        test $ret -eq 0 && gg_run test_scripts_release
+        test $ret -eq 0 && gg_run test_scripts
    fi

-    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
-        if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
-            test $ret -eq 0 && gg_run pythia_1_4b
-        else
-            test $ret -eq 0 && gg_run pythia_2_8b
-            #test $ret -eq 0 && gg_run open_llama_7b_v2
-        fi
-        test $ret -eq 0 && gg_run ctest_with_model_debug
-        test $ret -eq 0 && gg_run ctest_with_model_release
-    fi
+    test $ret -eq 0 && gg_run qwen3_0_6b
+
+    test $ret -eq 0 && gg_run ctest_with_model_debug
+    test $ret -eq 0 && gg_run ctest_with_model_release
 fi

+cat $OUT/README.md
+
 exit $ret
--- a/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
+++ b/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
@@ -0,0 +1,29 @@
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR riscv64)
+set(CMAKE_SYSTEM_VERSION 1)
+
+if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(riscv)")
+    message(STATUS "HOST SYSTEM ${CMAKE_HOST_SYSTEM_PROCESSOR}")
+else()
+    set(GNU_MACHINE riscv64-unknown-linux-gnu CACHE STRING "GNU compiler triple")
+    if (DEFINED ENV{RISCV_ROOT_PATH})
+        file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
+    else()
+        message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
+    endif()
+
+    set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain")
+    set(CMAKE_C_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc)
+    set(CMAKE_CXX_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++)
+    set(CMAKE_STRIP ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-strip)
+    set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu")
+    set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot")
+endif()
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
+set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
+set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -50,12 +50,17 @@ add_library(${TARGET} STATIC
    base64.hpp
    chat-parser.cpp
    chat-parser.h
+    chat-parser-xml-toolcall.h
+    chat-parser-xml-toolcall.cpp
    chat.cpp
    chat.h
    common.cpp
    common.h
    console.cpp
    console.h
+    download.cpp
+    download.h
+    http.h
    json-partial.cpp
    json-partial.h
    json-schema-to-grammar.cpp
@@ -76,10 +81,11 @@ if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()

+# TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
 set(LLAMA_COMMON_EXTRA_LIBS build_info)

-# Use curl to download model url
 if (LLAMA_CURL)
+    # Use curl to download model url
    find_package(CURL)
    if (NOT CURL_FOUND)
        message(FATAL_ERROR "Could NOT find CURL. Hint: to disable this feature, set -DLLAMA_CURL=OFF")
@@ -87,7 +93,11 @@ if (LLAMA_CURL)
    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
    include_directories(${CURL_INCLUDE_DIRS})
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
-endif ()
+elseif (LLAMA_HTTPLIB)
+    # otherwise, use cpp-httplib
+    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
+endif()

 if (LLAMA_LLGUIDANCE)
    include(ExternalProject)
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -59,8 +59,8 @@ struct common_arg {
    common_arg & set_sparam();
    bool in_example(enum llama_example ex);
    bool is_exclude(enum llama_example ex);
-    bool get_value_from_env(std::string & output);
-    bool has_value_from_env();
+    bool get_value_from_env(std::string & output) const;
+    bool has_value_from_env() const;
    std::string to_string();
 };

@@ -78,7 +78,6 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e

 // function to be used by test-arg-parser
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
-bool common_has_curl();

 struct common_remote_params {
    std::vector<std::string> headers;
--- a/common/chat-parser-xml-toolcall.cpp
+++ b/common/chat-parser-xml-toolcall.cpp
@@ -0,0 +1,861 @@
+#include "chat.h"
+#include "chat-parser.h"
+#include "common.h"
+#include "json-partial.h"
+#include "json-schema-to-grammar.h"
+#include "log.h"
+#include "regex-partial.h"
+
+using json = nlohmann::ordered_json;
+
+class xml_toolcall_syntax_exception : public std::runtime_error {
+  public:
+    xml_toolcall_syntax_exception(const std::string & message) : std::runtime_error(message) {}
+};
+
+template<typename T>
+inline void sort_uniq(std::vector<T> &vec) {
+    std::sort(vec.begin(), vec.end());
+    vec.erase(std::unique(vec.begin(), vec.end()), vec.end());
+}
+
+template<typename T>
+inline bool all_space(const T &str) {
+    return std::all_of(str.begin(), str.end(), [](unsigned char ch) { return std::isspace(ch); });
+}
+
+static size_t utf8_truncate_safe(const std::string_view s) {
+    size_t len = s.size();
+    if (len == 0) return 0;
+    size_t i = len;
+    for (size_t back = 0; back < 4 && i > 0; ++back) {
+        --i;
+        unsigned char c = s[i];
+        if ((c & 0x80) == 0) {
+            return len;
+        } else if ((c & 0xC0) == 0xC0) {
+            size_t expected_len = 0;
+            if ((c & 0xE0) == 0xC0) expected_len = 2;
+            else if ((c & 0xF0) == 0xE0) expected_len = 3;
+            else if ((c & 0xF8) == 0xF0) expected_len = 4;
+            else return i;
+            if (len - i >= expected_len) {
+                return len;
+            } else {
+                return i;
+            }
+        }
+    }
+    return len - std::min(len, size_t(3));
+}
+
+inline void utf8_truncate_safe_resize(std::string &s) {
+    s.resize(utf8_truncate_safe(s));
+}
+
+inline std::string_view utf8_truncate_safe_view(const std::string_view s) {
+    return s.substr(0, utf8_truncate_safe(s));
+}
+
+static std::optional<common_chat_msg_parser::find_regex_result> try_find_2_literal_splited_by_spaces(common_chat_msg_parser & builder, const std::string & literal1, const std::string & literal2) {
+    if (literal1.size() == 0) return builder.try_find_literal(literal2);
+    const auto saved_pos = builder.pos();
+    while (auto res = builder.try_find_literal(literal1)) {
+        builder.consume_spaces();
+        const auto match_len = std::min(literal2.size(), builder.input().size() - builder.pos());
+        if (builder.input().compare(builder.pos(), match_len, literal2, 0, match_len) == 0) {
+            if (res->prelude.size() != res->groups[0].begin - saved_pos) {
+                res->prelude = builder.str({saved_pos, res->groups[0].begin});
+            }
+            builder.move_to(builder.pos() + match_len);
+            res->groups[0].end = builder.pos();
+            GGML_ASSERT(res->groups[0].begin != res->groups[0].end);
+            return res;
+        }
+        builder.move_to(res->groups[0].begin + 1);
+    }
+    builder.move_to(saved_pos);
+    return std::nullopt;
+}
+
+/**
+ * make a GBNF that accept any strings except those containing any of the forbidden strings.
+ */
+std::string make_gbnf_excluding(std::vector<std::string> forbids) {
+    constexpr auto charclass_escape = [](unsigned char c) -> std::string {
+        if (c == '\\' || c == ']' || c == '^' || c == '-') {
+            std::string s = "\\";
+            s.push_back((char)c);
+            return s;
+        }
+        if (isprint(c)) {
+            return std::string(1, (char)c);
+        }
+        char buf[16];
+        snprintf(buf, 15, "\\x%02X", c);
+        return std::string(buf);
+    };
+    constexpr auto build_expr = [charclass_escape](auto self, const std::vector<std::string>& forbids, int l, int r, int depth) -> std::string {
+        std::vector<std::pair<unsigned char, std::pair<int,int>>> children;
+        int i = l;
+        while (i < r) {
+            const std::string &s = forbids[i];
+            if ((int)s.size() == depth) {
+                ++i;
+                continue;
+            }
+            unsigned char c = (unsigned char)s[depth];
+            int j = i;
+            while (j < r && (int)forbids[j].size() > depth &&
+                   (unsigned char)forbids[j][depth] == c) {
+                ++j;
+            }
+            children.push_back({c, {i, j}});
+            i = j;
+        }
+        std::vector<std::string> alts;
+        if (!children.empty()) {
+            std::string cls;
+            for (auto &ch : children) cls += charclass_escape(ch.first);
+            alts.push_back(std::string("[^") + cls + "]");
+        }
+        for (auto &ch : children) {
+            std::string childExpr = self(self, forbids, ch.second.first, ch.second.second, depth+1);
+            if (!childExpr.empty()) {
+                std::string quoted_ch = "\"";
+                if (ch.first == '\\') quoted_ch += "\\\\";
+                else if (ch.first == '"') quoted_ch += "\\\"";
+                else if (isprint(ch.first)) quoted_ch.push_back(ch.first);
+                else {
+                    char buf[16];
+                    snprintf(buf, 15, "\\x%02X", ch.first);
+                    quoted_ch += buf;
+                }
+                quoted_ch += "\"";
+                std::string branch = quoted_ch + std::string(" ") + childExpr;
+                alts.push_back(branch);
+            }
+        }
+        if (alts.empty()) return "";
+        std::ostringstream oss;
+        oss << "( ";
+        for (size_t k = 0; k < alts.size(); ++k) {
+            if (k) oss << " | ";
+            oss << alts[k];
+        }
+        oss << " )";
+        return oss.str();
+    };
+    if (forbids.empty()) return "( . )*";
+    sort(forbids.begin(), forbids.end());
+    std::string expr = build_expr(build_expr, forbids, 0, forbids.size(), 0);
+    if (expr.empty()) {
+        std::string cls;
+        for (auto &s : forbids) if (!s.empty()) cls += charclass_escape((unsigned char)s[0]);
+        expr = std::string("( [^") + cls + "] )";
+    }
+    if (forbids.size() == 1)
+        return expr + "*";
+    else
+        return std::string("( ") + expr + " )*";
+}
+
+/**
+ * Build grammar for xml-style tool call
+ * form.scope_start and form.scope_end can be empty.
+ * Requires data.format for model-specific hacks.
+ */
+void build_grammar_xml_tool_call(common_chat_params & data, const json & tools, const struct xml_tool_call_format & form) {
+    GGML_ASSERT(!form.tool_start.empty());
+    GGML_ASSERT(!form.tool_sep.empty());
+    GGML_ASSERT(!form.key_start.empty());
+    GGML_ASSERT(!form.val_end.empty());
+    GGML_ASSERT(!form.tool_end.empty());
+
+    std::string key_val_sep = form.key_val_sep;
+    if (form.key_val_sep2) {
+        key_val_sep += "\n";
+        key_val_sep += *form.key_val_sep2;
+    }
+    GGML_ASSERT(!key_val_sep.empty());
+
+    if (tools.is_array() && !tools.empty()) {
+        data.grammar = build_grammar([&](const common_grammar_builder &builder) {
+            auto string_arg_val = form.last_val_end ?
+                    builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end, *form.last_val_end})) :
+                    builder.add_rule("string-arg-val", make_gbnf_excluding({form.val_end}));
+
+            std::vector<std::string> tool_rules;
+            for (const auto & tool : tools) {
+                if (!tool.contains("type") || tool.at("type") != "function" || !tool.contains("function")) {
+                    LOG_WRN("Skipping tool without function: %s", tool.dump(2).c_str());
+                    continue;
+                }
+                const auto & function = tool.at("function");
+                if (!function.contains("name") || !function.at("name").is_string()) {
+                    LOG_WRN("Skipping invalid function (invalid name): %s", function.dump(2).c_str());
+                    continue;
+                }
+                if (!function.contains("parameters") || !function.at("parameters").is_object()) {
+                    LOG_WRN("Skipping invalid function (invalid parameters): %s", function.dump(2).c_str());
+                    continue;
+                }
+                std::string name = function.at("name");
+                auto parameters = function.at("parameters");
+                builder.resolve_refs(parameters);
+
+                struct parameter_rule {
+                    std::string symbol_name;
+                    bool is_required;
+                };
+                std::vector<parameter_rule> arg_rules;
+                if (!parameters.contains("properties") || !parameters.at("properties").is_object()) {
+                    LOG_WRN("Skipping invalid function (invalid properties): %s", function.dump(2).c_str());
+                    continue;
+                } else {
+                    std::vector<std::string> requiredParameters;
+                    if (parameters.contains("required")) {
+                        try { parameters.at("required").get_to(requiredParameters); }
+                        catch (const std::runtime_error&) {
+                            LOG_WRN("Invalid function required parameters, ignoring: %s", function.at("required").dump(2).c_str());
+                        }
+                    }
+                    sort_uniq(requiredParameters);
+                    for (const auto & [key, value] : parameters.at("properties").items()) {
+                        std::string quoted_key = key;
+                        bool required = std::binary_search(requiredParameters.begin(), requiredParameters.end(), key);
+                        if (form.key_start.back() == '"' && key_val_sep[0] == '"') {
+                            quoted_key = gbnf_format_literal(key);
+                            quoted_key = quoted_key.substr(1, quoted_key.size() - 2);
+                        }
+                        arg_rules.push_back(parameter_rule {builder.add_rule("func-" + name + "-kv-" + key,
+                            gbnf_format_literal(form.key_start) + " " +
+                            gbnf_format_literal(quoted_key) + " " +
+                            gbnf_format_literal(key_val_sep) + " " +
+                            ((value.contains("type") && value["type"].is_string() && value["type"] == "string" && (!form.raw_argval || *form.raw_argval)) ?
+                                    (form.raw_argval ?
+                                            string_arg_val :
+                                            "( " + string_arg_val + " | " + builder.add_schema(name + "-arg-" + key, value) + " )"
+                                    ) :
+                                    builder.add_schema(name + "-arg-" + key, value)
+                            )
+                        ), required});
+                    }
+                }
+
+                auto next_arg_with_sep = builder.add_rule(name + "-last-arg-end", form.last_val_end ? gbnf_format_literal(*form.last_val_end) : gbnf_format_literal(form.val_end));
+                decltype(next_arg_with_sep) next_arg = "\"\"";
+                for (auto i = arg_rules.size() - 1; /* i >= 0 && */ i < arg_rules.size(); --i) {
+                    std::string include_this_arg = arg_rules[i].symbol_name + " " + next_arg_with_sep;
+                    next_arg = builder.add_rule(name + "-arg-after-" + std::to_string(i), arg_rules[i].is_required ?
+                            include_this_arg : "( " + include_this_arg + " ) | " + next_arg
+                    );
+                    include_this_arg = gbnf_format_literal(form.val_end) + " " + include_this_arg;
+                    next_arg_with_sep = builder.add_rule(name + "-arg-after-" + std::to_string(i) + "-with-sep", arg_rules[i].is_required ?
+                            include_this_arg : "( " + include_this_arg + " ) | " + next_arg_with_sep
+                    );
+                }
+
+                std::string quoted_name = name;
+                if (form.tool_start.back() == '"' && form.tool_sep[0] == '"') {
+                    quoted_name = gbnf_format_literal(name);
+                    quoted_name = quoted_name.substr(1, quoted_name.size() - 2);
+                }
+                quoted_name = gbnf_format_literal(quoted_name);
+                // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
+                if (data.format == COMMON_CHAT_FORMAT_KIMI_K2) {
+                    quoted_name = "\"functions.\" " + quoted_name + " \":\" [0-9]+";
+                }
+                tool_rules.push_back(builder.add_rule(name + "-call",
+                        gbnf_format_literal(form.tool_start) + " " +
+                        quoted_name + " " +
+                        gbnf_format_literal(form.tool_sep) + " " +
+                        next_arg
+                ));
+            }
+
+            auto tool_call_once = builder.add_rule("root-tool-call-once", string_join(tool_rules, " | "));
+            auto tool_call_more = builder.add_rule("root-tool-call-more", gbnf_format_literal(form.tool_end) + " " + tool_call_once);
+            auto call_end = builder.add_rule("root-call-end", form.last_tool_end ? gbnf_format_literal(*form.last_tool_end) : gbnf_format_literal(form.tool_end));
+            auto tool_call_multiple_with_end = builder.add_rule("root-tool-call-multiple-with-end", tool_call_once + " " + tool_call_more + "* " + call_end);
+            builder.add_rule("root",
+                (form.scope_start.empty() ? "" : gbnf_format_literal(form.scope_start) + " ") +
+                tool_call_multiple_with_end  + "?" +
+                (form.scope_end.empty() ? "" : " " + gbnf_format_literal(form.scope_end))
+            );
+        });
+
+        // grammar trigger for tool call
+        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, form.scope_start + form.tool_start });
+    }
+}
+
+/**
+ * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
+ * Throws xml_toolcall_syntax_exception if there is invalid syntax and cannot recover the original status for common_chat_msg_parser.
+ * form.scope_start, form.tool_sep and form.scope_end can be empty.
+ */
+inline bool parse_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form) {
+    GGML_ASSERT(!form.tool_start.empty());
+    GGML_ASSERT(!form.key_start.empty());
+    GGML_ASSERT(!form.key_val_sep.empty());
+    GGML_ASSERT(!form.val_end.empty());
+    GGML_ASSERT(!form.tool_end.empty());
+
+    // Helper to choose return false or throw error
+    constexpr auto return_error = [](common_chat_msg_parser & builder, auto &start_pos, const bool &recovery) {
+        LOG_DBG("Failed to parse XML-Style tool call at position: %s\n", gbnf_format_literal(builder.consume_rest().substr(0, 20)).c_str());
+        if (recovery) {
+            builder.move_to(start_pos);
+            return false;
+        } else throw xml_toolcall_syntax_exception("Tool call parsing failed with unrecoverable errors. Try using a grammar to constrain the model’s output.");
+    };
+    // Drop substring from needle to end from a JSON
+    constexpr auto partial_json = [](std::string &json_str, std::string_view needle = "XML_TOOL_CALL_PARTIAL_FLAG") {
+        auto pos = json_str.rfind(needle);
+        if (pos == std::string::npos) {
+            return false;
+        }
+        for (auto i = pos + needle.size(); i < json_str.size(); ++i) {
+            unsigned char ch = static_cast<unsigned char>(json_str[i]);
+            if (ch != '\'' && ch != '"' && ch != '}' && ch != ':' && !std::isspace(ch)) {
+                return false;
+            }
+        }
+        if (pos != 0 && json_str[pos - 1] == '"') {
+            --pos;
+        }
+        json_str.resize(pos);
+        return true;
+    };
+    // Helper to generate a partial argument JSON
+    constexpr auto gen_partial_json = [partial_json](auto set_partial_arg, auto &arguments, auto &builder, auto &function_name) {
+        auto rest = builder.consume_rest();
+        utf8_truncate_safe_resize(rest);
+        set_partial_arg(rest, "XML_TOOL_CALL_PARTIAL_FLAG");
+        auto tool_str = arguments.dump();
+        if (partial_json(tool_str)) {
+            if (builder.add_tool_call(function_name, "", tool_str)) {
+                return;
+            }
+        }
+        LOG_DBG("Failed to parse partial XML-Style tool call, fallback to non-partial: %s\n", tool_str.c_str());
+    };
+    // Helper to find a close (because there may be form.last_val_end or form.last_tool_end)
+    constexpr auto try_find_close = [](
+            common_chat_msg_parser & builder,
+            const std::string & end,
+            const std::optional<std::string> & alt_end,
+            const std::string & end_next,
+            const std::optional<std::string> & alt_end_next
+    ) {
+        auto saved_pos = builder.pos();
+        auto tc = builder.try_find_literal(end);
+        auto val_end_size = end.size();
+        if (alt_end) {
+            auto pos_1 = builder.pos();
+            builder.move_to(saved_pos);
+            auto tc2 = try_find_2_literal_splited_by_spaces(builder, *alt_end, end_next);
+            if (alt_end_next) {
+                builder.move_to(saved_pos);
+                auto tc3 = try_find_2_literal_splited_by_spaces(builder, *alt_end, *alt_end_next);
+                if (tc3 && (!tc2 || tc2->prelude.size() > tc3->prelude.size())) {
+                    tc2 = tc3;
+                }
+            }
+            if (tc2 && (!tc || tc->prelude.size() > tc2->prelude.size())) {
+                tc = tc2;
+                tc->groups[0].end = std::min(builder.input().size(), tc->groups[0].begin + alt_end->size());
+                builder.move_to(tc->groups[0].end);
+                val_end_size = alt_end->size();
+            } else {
+                builder.move_to(pos_1);
+            }
+        }
+        return std::make_pair(val_end_size, tc);
+    };
+    // Helper to find a val_end or last_val_end, returns matched pattern size
+    const auto try_find_val_end = [try_find_close, &builder, &form]() {
+        return try_find_close(builder, form.val_end, form.last_val_end, form.tool_end, form.last_tool_end);
+    };
+    // Helper to find a tool_end or last_tool_end, returns matched pattern size
+    const auto try_find_tool_end = [try_find_close, &builder, &form]() {
+        return try_find_close(builder, form.tool_end, form.last_tool_end, form.scope_end, std::nullopt);
+    };
+
+    bool recovery = true;
+    const auto start_pos = builder.pos();
+    if (!all_space(form.scope_start)) {
+        if (auto tc = builder.try_find_literal(form.scope_start)) {
+            if (all_space(tc->prelude)) {
+                if (form.scope_start.size() != tc->groups[0].end - tc->groups[0].begin)
+                    throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.scope_start));
+            } else {
+                builder.move_to(start_pos);
+                return false;
+            }
+        } else return false;
+    }
+    while (auto tc = builder.try_find_literal(form.tool_start)) {
+        if (!all_space(tc->prelude)) {
+            LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
+                    gbnf_format_literal(form.tool_start).c_str(),
+                    gbnf_format_literal(tc->prelude).c_str()
+            );
+            builder.move_to(tc->groups[0].begin - tc->prelude.size());
+            break;
+        }
+
+        // Find tool name
+        auto func_name = builder.try_find_literal(all_space(form.tool_sep) ? form.key_start : form.tool_sep);
+        if (!func_name) {
+            auto [sz, tc] = try_find_tool_end();
+            func_name = tc;
+        }
+        if (!func_name) {
+            // Partial tool name not supported
+            throw common_chat_msg_partial_exception("incomplete tool_call");
+        }
+        // If the model generate multiple tool call and the first tool call has no argument
+        if (func_name->prelude.find(form.tool_end) != std::string::npos || (form.last_tool_end ? func_name->prelude.find(*form.last_tool_end) != std::string::npos : false)) {
+            builder.move_to(func_name->groups[0].begin - func_name->prelude.size());
+            auto [sz, tc] = try_find_tool_end();
+            func_name = tc;
+        }
+
+        // Parse tool name
+        builder.move_to(all_space(form.tool_sep) ? func_name->groups[0].begin : func_name->groups[0].end);
+        std::string function_name = string_strip(func_name->prelude);
+        // Kimi-K2 uses functions.{{ tool_call['function']['name'] }}:{{ loop.index }} as function name
+        if (builder.syntax().format == COMMON_CHAT_FORMAT_KIMI_K2) {
+            if (string_starts_with(function_name, "functions.")) {
+                static const std::regex re(":\\d+$");
+                if (std::regex_search(function_name, re)) {
+                    function_name = function_name.substr(10, function_name.rfind(":") - 10);
+                }
+            }
+        }
+
+        // Argument JSON
+        json arguments = json::object();
+
+        // Helper to generate a partial argument JSON
+        const auto gen_partial_args = [&](auto set_partial_arg) {
+            gen_partial_json(set_partial_arg, arguments, builder, function_name);
+        };
+
+        // Parse all arg_key/arg_value pairs
+        while (auto tc = builder.try_find_literal(form.key_start)) {
+            if (!all_space(tc->prelude)) {
+                LOG_DBG("XML-Style tool call: Expected %s, but found %s, trying to match next pattern\n",
+                        gbnf_format_literal(form.key_start).c_str(),
+                        gbnf_format_literal(tc->prelude).c_str()
+                );
+                builder.move_to(tc->groups[0].begin - tc->prelude.size());
+                break;
+            }
+            if (tc->groups[0].end - tc->groups[0].begin != form.key_start.size()) {
+                auto tool_call_arg = arguments.dump();
+                if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
+                    tool_call_arg.resize(tool_call_arg.size() - 1);
+                }
+                builder.add_tool_call(function_name, "", tool_call_arg);
+                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_start));
+            }
+
+            // Parse arg_key
+            auto key_res = builder.try_find_literal(form.key_val_sep);
+            if (!key_res) {
+                gen_partial_args([&](auto &rest, auto &needle) {arguments[rest + needle] = "";});
+                throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.key_val_sep) + " after " + gbnf_format_literal(form.key_start));
+            }
+            if (key_res->groups[0].end - key_res->groups[0].begin != form.key_val_sep.size()) {
+                gen_partial_args([&](auto &, auto &needle) {arguments[key_res->prelude + needle] = "";});
+                throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.key_val_sep));
+            }
+            auto &key = key_res->prelude;
+            recovery = false;
+
+            // Parse arg_value
+            if (form.key_val_sep2) {
+                if (auto tc = builder.try_find_literal(*form.key_val_sep2)) {
+                    if (!all_space(tc->prelude)) {
+                        LOG_DBG("Failed to parse XML-Style tool call: Unexcepted %s between %s and %s\n",
+                                gbnf_format_literal(tc->prelude).c_str(),
+                                gbnf_format_literal(form.key_val_sep).c_str(),
+                                gbnf_format_literal(*form.key_val_sep2).c_str()
+                        );
+                        return return_error(builder, start_pos, false);
+                    }
+                    if (tc->groups[0].end - tc->groups[0].begin != form.key_val_sep2->size()) {
+                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
+                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(*form.key_val_sep2));
+                    }
+                } else {
+                    gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
+                    throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(*form.key_val_sep2) + " after " + gbnf_format_literal(form.key_val_sep));
+                }
+            }
+            auto val_start = builder.pos();
+
+            // Test if arg_val is a partial JSON
+            std::optional<common_json> value_json = std::nullopt;
+            if (!form.raw_argval || !*form.raw_argval) {
+                try { value_json = builder.try_consume_json(); }
+                catch (const std::runtime_error&) { builder.move_to(val_start); }
+                // TODO: Delete this when json_partial adds top-level support for null/true/false
+                if (builder.pos() == val_start) {
+                    const static std::regex number_regex(R"([0-9-][0-9]*(\.\d*)?([eE][+-]?\d*)?)");
+                    builder.consume_spaces();
+                    std::string_view sv = utf8_truncate_safe_view(builder.input());
+                    sv.remove_prefix(builder.pos());
+                    std::string rest = "a";
+                    if (sv.size() < 6) rest = sv;
+                    if (string_starts_with("null", rest) || string_starts_with("true", rest) || string_starts_with("false", rest) || std::regex_match(sv.begin(), sv.end(), number_regex)) {
+                        value_json = {123, {"123", "123"}};
+                        builder.consume_rest();
+                    } else {
+                        builder.move_to(val_start);
+                    }
+                }
+            }
+
+            // If it is a JSON and followed by </arg_value>, parse as json
+            // cannot support streaming because it may be a plain text starting with JSON
+            if (value_json) {
+                auto json_end = builder.pos();
+                builder.consume_spaces();
+                if (builder.pos() == builder.input().size()) {
+                    if (form.raw_argval && !*form.raw_argval && (value_json->json.is_string() || value_json->json.is_object() || value_json->json.is_array())) {
+                        arguments[key] = value_json->json;
+                        auto json_str = arguments.dump();
+                        if (!value_json->healing_marker.json_dump_marker.empty()) {
+                            GGML_ASSERT(std::string::npos != json_str.rfind(value_json->healing_marker.json_dump_marker));
+                            json_str.resize(json_str.rfind(value_json->healing_marker.json_dump_marker));
+                        } else {
+                            GGML_ASSERT(json_str.back() == '}');
+                            json_str.resize(json_str.size() - 1);
+                        }
+                        builder.add_tool_call(function_name, "", json_str);
+                    } else {
+                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
+                    }
+                    LOG_DBG("Possible JSON arg_value: %s\n", value_json->json.dump().c_str());
+                    throw common_chat_msg_partial_exception("JSON arg_value detected. Waiting for more tokens for validations.");
+                }
+                builder.move_to(json_end);
+                auto [val_end_size, tc] = try_find_val_end();
+                if (tc && all_space(tc->prelude) && value_json->healing_marker.marker.empty()) {
+                    if (tc->groups[0].end - tc->groups[0].begin != val_end_size) {
+                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = needle;});
+                        LOG_DBG("Possible terminated JSON arg_value: %s\n", value_json->json.dump().c_str());
+                        throw common_chat_msg_partial_exception("Partial literal: " + gbnf_format_literal(form.val_end) + (form.last_val_end ? gbnf_format_literal(*form.last_val_end) : ""));
+                    } else arguments[key] = value_json->json;
+                } else builder.move_to(val_start);
+            }
+
+            // If not, parse as plain text
+            if (val_start == builder.pos()) {
+                if (auto [val_end_size, value_plain] = try_find_val_end(); value_plain) {
+                    auto &value_str = value_plain->prelude;
+                    if (form.trim_raw_argval) value_str = string_strip(value_str);
+                    if (value_plain->groups[0].end - value_plain->groups[0].begin != val_end_size) {
+                        gen_partial_args([&](auto &, auto &needle) {arguments[key] = value_str + needle;});
+                        throw common_chat_msg_partial_exception(
+                                "Expected " + gbnf_format_literal(form.val_end) +
+                                " after " + gbnf_format_literal(form.key_val_sep) +
+                                (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
+                        );
+                    }
+                    arguments[key] = value_str;
+                } else {
+                    if (form.trim_raw_argval) {
+                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = string_strip(rest) + needle;});
+                    } else {
+                        gen_partial_args([&](auto &rest, auto &needle) {arguments[key] = rest + needle;});
+                    }
+                    throw common_chat_msg_partial_exception(
+                            "Expected " + gbnf_format_literal(form.val_end) +
+                            " after " + gbnf_format_literal(form.key_val_sep) +
+                            (form.key_val_sep2 ? " " + gbnf_format_literal(*form.key_val_sep2) : "")
+                    );
+                }
+            }
+        }
+
+        // Consume closing tag
+        if (auto [tool_end_size, tc] = try_find_tool_end(); tc) {
+            if (!all_space(tc->prelude)) {
+                LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
+                        gbnf_format_literal(form.tool_end).c_str(),
+                        gbnf_format_literal(tc->prelude).c_str()
+                );
+                return return_error(builder, start_pos, recovery);
+            }
+            if (tc->groups[0].end - tc->groups[0].begin == tool_end_size) {
+                // Add the parsed tool call
+                if (!builder.add_tool_call(function_name, "", arguments.dump())) {
+                    throw common_chat_msg_partial_exception("Failed to add XML-Style tool call");
+                }
+                recovery = false;
+                continue;
+            }
+        }
+
+        auto tool_call_arg = arguments.dump();
+        if (tool_call_arg.size() != 0 && tool_call_arg[tool_call_arg.size() - 1] == '}') {
+            tool_call_arg.resize(tool_call_arg.size() - 1);
+        }
+        builder.add_tool_call(function_name, "", tool_call_arg);
+        throw common_chat_msg_partial_exception("Expected " + gbnf_format_literal(form.tool_end) + " after " + gbnf_format_literal(form.val_end));
+    }
+    if (auto tc = builder.try_find_literal(form.scope_end)) {
+        if (!all_space(tc->prelude)) {
+            LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
+                    gbnf_format_literal(form.scope_end).c_str(),
+                    gbnf_format_literal(tc->prelude).c_str()
+            );
+            return return_error(builder, start_pos, recovery);
+        }
+    } else {
+        if (all_space(form.scope_end)) return true;
+        builder.consume_spaces();
+        if (builder.pos() == builder.input().size())
+            throw common_chat_msg_partial_exception("incomplete tool calls");
+        LOG_DBG("Failed to parse XML-Style tool call: Expected %s, but found %s\n",
+                gbnf_format_literal(form.scope_end).c_str(),
+                gbnf_format_literal(builder.consume_rest()).c_str()
+        );
+        return return_error(builder, start_pos, recovery);
+    }
+
+    return true;
+}
+
+/**
+ * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
+ * May cause std::runtime_error if there is invalid syntax because partial valid tool call is already sent out to client.
+ * form.scope_start, form.tool_sep and form.scope_end can be empty.
+ */
+bool common_chat_msg_parser::try_consume_xml_tool_calls(const struct xml_tool_call_format & form) {
+    auto pos = pos_;
+    auto tsize = result_.tool_calls.size();
+    try { return parse_xml_tool_calls(*this, form); }
+    catch (const xml_toolcall_syntax_exception&) {}
+    move_to(pos);
+    result_.tool_calls.resize(tsize);
+    return false;
+}
+
+/**
+ * Parse content uses reasoning and XML-Style tool call
+ * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
+ */
+inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>") {
+    constexpr auto rstrip = [](std::string &s) {
+        s.resize(std::distance(s.begin(), std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { return !std::isspace(ch); }).base()));
+    };
+    // Erase substring from l to r, along with additional spaces nearby
+    constexpr auto erase_spaces = [](auto &str, size_t l, size_t r) {
+        while (/* l > -1 && */ --l < str.size() && std::isspace(static_cast<unsigned char>(str[l])));
+        ++l;
+        while (++r < str.size() && std::isspace(static_cast<unsigned char>(str[r])));
+        if (l < r) str[l] = '\n';
+        if (l + 1 < r) str[l + 1] = '\n';
+        if (l != 0) l += 2;
+        str.erase(l, r - l);
+        return l;
+    };
+    constexpr auto trim_suffix = [](std::string &content, std::initializer_list<std::string_view> list) {
+        auto best_match = content.size();
+        for (auto pattern: list) {
+            if (pattern.size() == 0) continue;
+            for (auto match_idx = content.size() - std::min(pattern.size(), content.size()); content.size() > match_idx; match_idx++) {
+                auto match_len = content.size() - match_idx;
+                if (content.compare(match_idx, match_len, pattern.data(), match_len) == 0 && best_match > match_idx) {
+                    best_match = match_idx;
+                }
+            }
+        }
+        if (content.size() > best_match) {
+            content.erase(best_match);
+        }
+    };
+    const auto trim_potential_partial_word = [&start_think, &end_think, &form, trim_suffix](std::string &content) {
+        return trim_suffix(content, {
+            start_think, end_think, form.scope_start, form.tool_start, form.tool_sep, form.key_start,
+            form.key_val_sep, form.key_val_sep2 ? form.key_val_sep2->c_str() : "",
+            form.val_end, form.last_val_end ? form.last_val_end->c_str() : "",
+            form.tool_end, form.last_tool_end ? form.last_tool_end->c_str() : "",
+            form.scope_end
+        });
+    };
+
+
+    // Trim leading spaces without affecting keyword matching
+    static const common_regex spaces_regex("\\s*");
+    {
+        auto tc = builder.consume_regex(spaces_regex);
+        auto spaces = builder.str(tc.groups[0]);
+        auto s1 = spaces.size();
+        trim_potential_partial_word(spaces);
+        auto s2 = spaces.size();
+        builder.move_to(builder.pos() - (s1 - s2));
+    }
+
+    // Parse content
+    bool reasoning_unclosed = builder.syntax().thinking_forced_open;
+    std::string unclosed_reasoning_content("");
+    for (;;) {
+        auto tc = try_find_2_literal_splited_by_spaces(builder, form.scope_start, form.tool_start);
+        std::string content;
+        std::string tool_call_start;
+
+        if (tc) {
+            content = std::move(tc->prelude);
+            tool_call_start = builder.str(tc->groups[0]);
+            LOG_DBG("Matched tool start: %s\n", gbnf_format_literal(tool_call_start).c_str());
+        } else {
+            content = builder.consume_rest();
+            utf8_truncate_safe_resize(content);
+        }
+
+        // Handle unclosed think block
+        if (reasoning_unclosed) {
+            if (auto pos = content.find(end_think); pos == std::string::npos && builder.pos() != builder.input().size()) {
+                unclosed_reasoning_content += content;
+                if (form.allow_toolcall_in_think) {
+                    builder.move_to(tc->groups[0].begin);
+                    if (!builder.try_consume_xml_tool_calls(form)) {
+                        unclosed_reasoning_content += tool_call_start;
+                        builder.move_to(tc->groups[0].end);
+                    }
+                } else {
+                    unclosed_reasoning_content += tool_call_start;
+                }
+                continue;
+            } else {
+                reasoning_unclosed = false;
+                std::string reasoning_content;
+                if (pos == std::string::npos) {
+                    reasoning_content = std::move(content);
+                } else {
+                    reasoning_content = content.substr(0, pos);
+                    content.erase(0, pos + end_think.size());
+                }
+                if (builder.pos() == builder.input().size() && all_space(content)) {
+                    rstrip(reasoning_content);
+                    trim_potential_partial_word(reasoning_content);
+                    rstrip(reasoning_content);
+                    if (reasoning_content.empty()) {
+                        rstrip(unclosed_reasoning_content);
+                        trim_potential_partial_word(unclosed_reasoning_content);
+                        rstrip(unclosed_reasoning_content);
+                        if (unclosed_reasoning_content.empty()) continue;
+                    }
+                }
+                if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
+                    builder.add_content(start_think);
+                    builder.add_content(unclosed_reasoning_content);
+                    builder.add_content(reasoning_content);
+                    if (builder.pos() != builder.input().size() || !all_space(content))
+                        builder.add_content(end_think);
+                } else {
+                    builder.add_reasoning_content(unclosed_reasoning_content);
+                    builder.add_reasoning_content(reasoning_content);
+                }
+                unclosed_reasoning_content.clear();
+            }
+        }
+
+        // Handle multiple think block
+        bool toolcall_in_think = false;
+        for (auto think_start = content.find(start_think); think_start != std::string::npos; think_start = content.find(start_think, think_start)) {
+            if (auto think_end = content.find(end_think, think_start + start_think.size()); think_end != std::string::npos) {
+                if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
+                    auto reasoning_content = content.substr(think_start + start_think.size(), think_end - think_start - start_think.size());
+                    builder.add_reasoning_content(reasoning_content);
+                    think_start = erase_spaces(content, think_start, think_end + end_think.size() - 1);
+                } else {
+                    think_start = think_end + end_think.size() - 1;
+                }
+            } else {
+                // This <tool_call> start is in thinking block, skip this tool call
+                auto pos = think_start + start_think.size();
+                unclosed_reasoning_content = content.substr(pos) + tool_call_start;
+                reasoning_unclosed = true;
+                content.resize(think_start);
+                toolcall_in_think = true;
+            }
+        }
+
+        if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content) {
+            rstrip(content);
+            // Handle unclosed </think> token from content: delete all </think> token
+            if (auto pos = content.rfind(end_think); pos != std::string::npos) {
+                while (pos != std::string::npos) {
+                    pos = erase_spaces(content, pos, pos + end_think.size() - 1);
+                    pos = content.rfind(end_think, pos);
+                }
+            }
+            // Strip if needed
+            if (content.size() > 0 && std::isspace(static_cast<unsigned char>(content[0]))) {
+                content = string_strip(content);
+            }
+        }
+
+        // remove potential partial suffix
+        if (content.size() > 0 && builder.pos() == builder.input().size() && unclosed_reasoning_content.empty()) {
+            rstrip(content);
+            trim_potential_partial_word(content);
+            rstrip(content);
+        }
+
+        // Add content
+        if (content.size() != 0) {
+            // If there are multiple content blocks
+            if (builder.syntax().reasoning_format != COMMON_REASONING_FORMAT_NONE && !builder.syntax().reasoning_in_content && builder.result().content.size() != 0) {
+                builder.add_content("\n\n");
+            }
+            builder.add_content(content);
+        }
+
+        // This <tool_call> start is in thinking block, skip this tool call
+        if (toolcall_in_think && !form.allow_toolcall_in_think) {
+            continue;
+        }
+
+        // There is no tool call and all content is parsed
+        if (!tc) {
+            GGML_ASSERT(builder.pos() == builder.input().size());
+            GGML_ASSERT(unclosed_reasoning_content.empty());
+            GGML_ASSERT(!reasoning_unclosed);
+            break;
+        }
+
+        builder.move_to(tc->groups[0].begin);
+        if (builder.try_consume_xml_tool_calls(form)) {
+            auto end_of_tool = builder.pos();
+            builder.consume_spaces();
+            if (builder.pos() != builder.input().size()) {
+                builder.move_to(end_of_tool);
+                if (!builder.result().content.empty()) {
+                    builder.add_content("\n\n");
+                }
+            }
+        } else {
+            static const common_regex next_char_regex(".");
+            auto c = builder.str(builder.consume_regex(next_char_regex).groups[0]);
+            rstrip(c);
+            builder.add_content(c);
+        }
+    }
+}
+
+/**
+ * Parse content uses reasoning and XML-Style tool call
+ * TODO: Note that form.allow_toolcall_in_think is not tested yet. If anyone confirms it works, this comment can be removed.
+ */
+void common_chat_msg_parser::consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think, const std::string & end_think) {
+    parse_msg_with_xml_tool_calls(*this, form, start_think, end_think);
+}
--- a/common/chat-parser-xml-toolcall.h
+++ b/common/chat-parser-xml-toolcall.h
@@ -0,0 +1,45 @@
+#pragma once
+
+#include "chat.h"
+
+#include <nlohmann/json.hpp>
+
+#include <optional>
+#include <string>
+#include <vector>
+
+
+// Sample config:
+// MiniMax-M2 (left): <minimax:tool_call>\n<invoke name="tool-name">\n<parameter name="key">value</parameter>\n...</invoke>\n...</minimax:tool_call>
+// GLM 4.5   (right): <tool_call>function_name\n<arg_key>key</arg_key>\n<arg_value>value</arg_value>\n</tool_call>
+struct xml_tool_call_format {
+    std::string scope_start; // <minimax:tool_call>\n  // \n                      // can be empty
+    std::string tool_start;  // <invoke name=\"        // <tool_call>
+    std::string tool_sep;    // \">\n                  // \n                      // can be empty only for parse_xml_tool_calls
+    std::string key_start;   // <parameter name=\"     // <arg_key>
+    std::string key_val_sep; // \">                    // </arg_key>\n<arg_value>
+    std::string val_end;     // </parameter>\n         // </arg_value>\n
+    std::string tool_end;    // </invoke>\n            // </tool_call>\n
+    std::string scope_end;   // </minimax:tool_call>   //                         // can be empty
+    // Set this if there can be dynamic spaces inside key_val_sep.
+    // e.g. key_val_sep=</arg_key> key_val_sep2=<arg_value> for GLM4.5
+    std::optional<std::string> key_val_sep2 = std::nullopt;
+    // Set true if argval should only be raw string. e.g. Hello "world" hi
+    // Set false if argval should only be json string. e.g. "Hello \"world\" hi"
+    // Defaults to std::nullopt, both will be allowed.
+    std::optional<bool> raw_argval = std::nullopt;
+    std::optional<std::string> last_val_end = std::nullopt;
+    std::optional<std::string> last_tool_end = std::nullopt;
+    bool trim_raw_argval = false;
+    bool allow_toolcall_in_think = false; // TODO: UNTESTED!!!
+};
+
+// make a GBNF that accept any strings except those containing any of the forbidden strings.
+std::string make_gbnf_excluding(std::vector<std::string> forbids);
+
+/**
+ * Build grammar for xml-style tool call
+ * form.scope_start and form.scope_end can be empty.
+ * Requires data.format for model-specific hacks.
+ */
+void build_grammar_xml_tool_call(common_chat_params & data, const nlohmann::ordered_json & tools, const struct xml_tool_call_format & form);
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
--- a/common/chat-parser.h
+++ b/common/chat-parser.h
@@ -1,6 +1,7 @@
 #pragma once

 #include "chat.h"
+#include "chat-parser-xml-toolcall.h"
 #include "json-partial.h"
 #include "regex-partial.h"

@@ -64,6 +65,9 @@ class common_chat_msg_parser {
    // Adds an array of tool calls using their "name", "id" and "arguments" fields.
    bool add_tool_calls(const nlohmann::ordered_json & arr);

+    // Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
+    bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);
+
    void finish();

    bool consume_spaces();
@@ -116,5 +120,14 @@ class common_chat_msg_parser {
        const std::vector<std::vector<std::string>> & content_paths = {}
    );

+    /**
+     * Parse XML-Style tool call for given xml_tool_call_format. Return false for invalid syntax and get the position untouched.
+     * form.scope_start, form.tool_sep and form.scope_end can be empty.
+     */
+    bool try_consume_xml_tool_calls(const struct xml_tool_call_format & form);
+
+    // Parse content uses reasoning and XML-Style tool call
+    void consume_reasoning_with_xml_tool_calls(const struct xml_tool_call_format & form, const std::string & start_think = "<think>", const std::string & end_think = "</think>");
+
    void clear_tools();
 };
--- a/common/chat.cpp
+++ b/common/chat.cpp
--- a/common/chat.h
+++ b/common/chat.h
@@ -33,8 +33,8 @@ struct common_chat_msg_content_part {
 struct common_chat_msg {
    std::string role;
    std::string content;
-    std::vector<common_chat_msg_content_part> content_parts = {};
-    std::vector<common_chat_tool_call> tool_calls = {};
+    std::vector<common_chat_msg_content_part> content_parts;
+    std::vector<common_chat_tool_call> tool_calls;
    std::string reasoning_content;
    std::string tool_name;
    std::string tool_call_id;
@@ -44,7 +44,7 @@ struct common_chat_msg {
    bool empty() const {
        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
    }
-    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+    void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
            if (ids_cache.size() <= i) {
                auto id = tool_calls[i].id;
@@ -101,6 +101,7 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_CONTENT_ONLY,
    COMMON_CHAT_FORMAT_GENERIC,
    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+    COMMON_CHAT_FORMAT_MAGISTRAL,
    COMMON_CHAT_FORMAT_LLAMA_3_X,
    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
@@ -114,6 +115,14 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_GPT_OSS,
    COMMON_CHAT_FORMAT_SEED_OSS,
    COMMON_CHAT_FORMAT_NEMOTRON_V2,
+    COMMON_CHAT_FORMAT_APERTUS,
+    COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS,
+    COMMON_CHAT_FORMAT_GLM_4_5,
+    COMMON_CHAT_FORMAT_MINIMAX_M2,
+    COMMON_CHAT_FORMAT_KIMI_K2,
+    COMMON_CHAT_FORMAT_QWEN3_CODER_XML,
+    COMMON_CHAT_FORMAT_APRIEL_1_5,
+    COMMON_CHAT_FORMAT_XIAOMI_MIMO,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -8,12 +8,14 @@
 #include "common.h"
 #include "log.h"
 #include "llama.h"
+#include "sampling.h"

 #include <algorithm>
 #include <cinttypes>
 #include <climits>
 #include <cmath>
 #include <codecvt>
+#include <chrono>
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
@@ -25,7 +27,6 @@
 #include <sstream>
 #include <string>
 #include <thread>
-#include <unordered_map>
 #include <unordered_set>
 #include <vector>

@@ -50,10 +51,23 @@
 #include <unistd.h>
 #endif

+#if defined(__linux__)
+#include <sys/types.h>
+#include <pwd.h>
+#endif
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+
+common_time_meas::~common_time_meas() {
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
+    }
+}
+
 //
 // CPU utils
 //
@@ -349,11 +363,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
 }

 void common_init() {
-    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
-        if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
-            common_log_add(common_log_main(), level, "%s", text);
-        }
-    }, NULL);
+    llama_log_set(common_log_default_callback, NULL);

 #ifdef NDEBUG
    const char * build_type = "";
@@ -864,8 +874,20 @@ std::string fs_get_cache_directory() {
 #if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
        if (std::getenv("XDG_CACHE_HOME")) {
            cache_directory = std::getenv("XDG_CACHE_HOME");
-        } else {
+        } else if (std::getenv("HOME")) {
            cache_directory = std::getenv("HOME") + std::string("/.cache/");
+        } else {
+#if defined(__linux__)
+            /* no $HOME is defined, fallback to getpwuid */
+            struct passwd *pw = getpwuid(getuid());
+            if ((!pw) || (!pw->pw_dir)) {
+                throw std::runtime_error("Failed to find $HOME directory");
+            }
+
+            cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
+#else /* defined(__linux__) */
+            throw std::runtime_error("Failed to find $HOME directory");
+#endif /* defined(__linux__) */
        }
 #elif defined(__APPLE__)
        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
@@ -890,11 +912,96 @@ std::string fs_get_cache_file(const std::string & filename) {
    return cache_directory + filename;
 }

+std::vector<common_file_info> fs_list_files(const std::string & path) {
+    std::vector<common_file_info> files;
+    if (path.empty()) return files;
+
+    std::filesystem::path dir(path);
+    if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
+        return files;
+    }
+
+    for (const auto & entry : std::filesystem::directory_iterator(dir)) {
+        try {
+            // Only include regular files (skip directories)
+            const auto & p = entry.path();
+            if (std::filesystem::is_regular_file(p)) {
+                common_file_info info;
+                info.path = p.string();
+                info.name = p.filename().string();
+                try {
+                    info.size = static_cast<size_t>(std::filesystem::file_size(p));
+                } catch (const std::filesystem::filesystem_error &) {
+                    info.size = 0;
+                }
+                files.push_back(std::move(info));
+            }
+        } catch (const std::filesystem::filesystem_error &) {
+            // skip entries we cannot inspect
+            continue;
+        }
+    }
+
+    return files;
+}
+

 //
 // Model utils
 //

+static inline void common_init_sampler_from_model(
+    const llama_model * model,
+    common_params_sampling & sparams) {
+
+    const uint64_t config = sparams.user_sampling_config;
+
+    auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
+        if (config & user_config) return;
+
+        char buf[64] = {0};
+        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
+            char * end = nullptr;
+            int32_t v = strtol(buf, &end, 10);
+            if (end && end != buf) dst = v;
+        }
+    };
+
+    auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
+        if (config & user_config) return;
+
+        char buf[128] = {0};
+        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
+            char * end = nullptr;
+            float v = strtof(buf, &end);
+            if (end && end != buf) dst = v;
+        }
+    };
+
+    // Sampling sequence
+    if (!(config & common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS)) {
+        char buf[512] = {0};
+        if (llama_model_meta_val_str(model, llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_SEQUENCE), buf, sizeof(buf)) > 0) {
+            const std::vector<std::string> sampler_names = string_split<std::string>(std::string(buf), ';');
+            if (!sampler_names.empty()) {
+                sparams.samplers = common_sampler_types_from_names(sampler_names, true);
+            }
+        }
+    }
+
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_K),           sparams.top_k,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TOP_P),           sparams.top_p,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIN_P),           sparams.min_p,           common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_PROBABILITY), sparams.xtc_probability, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_XTC_THRESHOLD),   sparams.xtc_threshold,   common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_TEMP),            sparams.temp,            common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TEMP);
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_LAST_N),  sparams.penalty_last_n,  common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_PENALTY_REPEAT),  sparams.penalty_repeat,  common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT);
+    get_int32(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT),        sparams.mirostat,        common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_TAU),    sparams.mirostat_tau,    common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU);
+    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA),    sparams.mirostat_eta,    common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
+}
+
 struct common_init_result common_init_from_params(common_params & params) {
    common_init_result iparams;
    auto mparams = common_model_params_to_llama(params);
@@ -906,6 +1013,8 @@ struct common_init_result common_init_from_params(common_params & params) {
        return iparams;
    }

+    common_init_sampler_from_model(model, params.sampling);
+
    const llama_vocab * vocab = llama_model_get_vocab(model);

    auto cparams = common_context_params_to_llama(params);
@@ -960,15 +1069,13 @@ struct common_init_result common_init_from_params(common_params & params) {

        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+        bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;

-        if (!has_eos && !has_sep) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+        if (!has_eos && !has_sep && !has_rerank_prompt) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
            ok = false;
        } else if (!has_eos) {
            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-        } else if (!has_sep) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
        }

        if (!ok) {
@@ -1117,6 +1224,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
    mparams.use_mlock       = params.use_mlock;
    mparams.check_tensors   = params.check_tensors;
    mparams.use_extra_bufts = !params.no_extra_bufts;
+    mparams.no_host         = params.no_host;

    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
--- a/common/common.h
+++ b/common/common.h
@@ -2,17 +2,15 @@

 #pragma once

+#include "ggml-opt.h"
+#include "llama-cpp.h"
+
 #include <set>
 #include <sstream>
 #include <string>
 #include <string_view>
 #include <vector>
 #include <map>
-#include <sstream>
-#include <cmath>
-
-#include "ggml-opt.h"
-#include "llama-cpp.h"

 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -30,6 +28,15 @@

 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

+struct common_time_meas {
+    common_time_meas(int64_t & t_acc, bool disable = false);
+    ~common_time_meas();
+
+    const int64_t t_start_us;
+
+    int64_t & t_acc;
+};
+
 struct common_adapter_lora_info {
    std::string path;
    float scale;
@@ -133,6 +140,22 @@ struct common_grammar_trigger {
    llama_token token = LLAMA_TOKEN_NULL;
 };

+enum common_params_sampling_config : uint64_t {
+    COMMON_PARAMS_SAMPLING_CONFIG_SAMPLERS        = 1 << 0,
+    COMMON_PARAMS_SAMPLING_CONFIG_TOP_K           = 1 << 1,
+    COMMON_PARAMS_SAMPLING_CONFIG_TOP_P           = 1 << 2,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIN_P           = 1 << 3,
+    COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY = 1 << 4,
+    COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD   = 1 << 5,
+    COMMON_PARAMS_SAMPLING_CONFIG_TEMP            = 1 << 6,
+    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_LAST_N  = 1 << 7,
+    COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT  = 1 << 8,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT        = 1 << 9,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU    = 1 << 10,
+    COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA    = 1 << 11,
+};
+
+
 // sampling parameters
 struct common_params_sampling {
    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -165,6 +188,8 @@ struct common_params_sampling {
    bool    no_perf            = false; // disable performance metrics
    bool    timing_per_token   = false;

+    uint64_t user_sampling_config = 0; // bitfield to track user-specified samplers
+
    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY


@@ -378,7 +403,7 @@ struct common_params {
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
    bool no_perf           = false; // disable performance metrics
-    bool ctx_shift         = false;  // context shift on infinite text generation
+    bool ctx_shift         = false; // context shift on infinite text generation
    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified        = false; // enable unified KV cache

@@ -392,6 +417,7 @@ struct common_params {
    bool check_tensors     = false; // validate tensor data
    bool no_op_offload     = false; // globally disable offload host tensor operations to device
    bool no_extra_bufts    = false; // disable extra buffer types (used for weight repacking)
+    bool no_host           = false; // bypass host buffer allowing extra buffers to be used

    bool single_turn       = false; // single turn chat conversation

@@ -405,6 +431,8 @@ struct common_params {
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
    bool no_mmproj = false;         // explicitly disable multimodal model
    std::vector<std::string> image; // path to image file(s)
+    int image_min_tokens = -1;
+    int image_max_tokens = -1;

    // finetune
    struct lr_opt lr;
@@ -424,7 +452,8 @@ struct common_params {
    int32_t timeout_write     = timeout_read; // http write timeout in seconds
    int32_t n_threads_http    = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse     = 0;            // min chunk size to reuse from the cache via KV shifting
-    int32_t n_swa_checkpoints = 3;            // max number of SWA checkpoints per slot
+    int32_t n_ctx_checkpoints = 8;            // max number of context checkpoints per slot
+    int32_t cache_ram_mib     = 8192;         // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.

    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";                                                                         // NOLINT
@@ -432,7 +461,7 @@ struct common_params {
    std::string chat_template = "";                                                                         // NOLINT
    bool use_jinja = false;                                                                                 // NOLINT
    bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
    int reasoning_budget = -1;
    bool prefill_assistant = true;                                                                          // if true, any trailing assistant message will be prefilled into the response

@@ -456,7 +485,8 @@ struct common_params {
    float slot_prompt_similarity = 0.1f;

    // batched-bench params
-    bool is_pp_shared = false;
+    bool is_pp_shared   = false;
+    bool is_tg_separate = false;

    std::vector<int32_t> n_pp;
    std::vector<int32_t> n_tg;
@@ -503,6 +533,10 @@ struct common_params {
    // return false from callback to abort model loading or true to continue
    llama_progress_callback load_progress_callback = NULL;
    void *                  load_progress_callback_user_data = NULL;
+
+    bool has_speculative() const {
+        return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
+    }
 };

 // call once at the start of a program if it uses libcommon
@@ -603,6 +637,13 @@ bool fs_create_directory_with_parents(const std::string & path);
 std::string fs_get_cache_directory();
 std::string fs_get_cache_file(const std::string & filename);

+struct common_file_info {
+    std::string path;
+    std::string name;
+    size_t      size = 0; // in bytes
+};
+std::vector<common_file_info> fs_list_files(const std::string & path);
+
 //
 // Model utils
 //
@@ -738,7 +779,7 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 // MoE utils
 //

-const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
+const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";

 static std::string llm_ffn_exps_block_regex(int idx) {
    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
--- a/common/download.cpp
+++ b/common/download.cpp
--- a/common/download.h
+++ b/common/download.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <string>
+
+struct common_params_model;
+
+//
+// download functionalities
+//
+
+struct common_cached_model_info {
+    std::string manifest_path;
+    std::string user;
+    std::string model;
+    std::string tag;
+    size_t      size = 0; // GGUF size in bytes
+    std::string to_string() const {
+        return user + "/" + model + ":" + tag;
+    }
+};
+
+struct common_hf_file_res {
+    std::string repo; // repo name with ":tag" removed
+    std::string ggufFile;
+    std::string mmprojFile;
+};
+
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+common_hf_file_res common_get_hf_file(
+    const std::string & hf_repo_with_tag,
+    const std::string & bearer_token,
+    bool offline);
+
+// returns true if download succeeded
+bool common_download_model(
+    const common_params_model & model,
+    const std::string & bearer_token,
+    bool offline);
+
+// returns list of cached models
+std::vector<common_cached_model_info> common_list_cached_models();
+
+// resolve and download model from Docker registry
+// return local path to downloaded model file
+std::string common_docker_resolve_model(const std::string & docker);
--- a/common/http.h
+++ b/common/http.h
@@ -0,0 +1,73 @@
+#pragma once
+
+#include <cpp-httplib/httplib.h>
+
+struct common_http_url {
+    std::string scheme;
+    std::string user;
+    std::string password;
+    std::string host;
+    std::string path;
+};
+
+static common_http_url common_http_parse_url(const std::string & url) {
+    common_http_url parts;
+    auto scheme_end = url.find("://");
+
+    if (scheme_end == std::string::npos) {
+        throw std::runtime_error("invalid URL: no scheme");
+    }
+    parts.scheme = url.substr(0, scheme_end);
+
+    if (parts.scheme != "http" && parts.scheme != "https") {
+        throw std::runtime_error("unsupported URL scheme: " + parts.scheme);
+    }
+
+    auto rest = url.substr(scheme_end + 3);
+    auto at_pos = rest.find('@');
+
+    if (at_pos != std::string::npos) {
+        auto auth = rest.substr(0, at_pos);
+        auto colon_pos = auth.find(':');
+        if (colon_pos != std::string::npos) {
+            parts.user = auth.substr(0, colon_pos);
+            parts.password = auth.substr(colon_pos + 1);
+        } else {
+            parts.user = auth;
+        }
+        rest = rest.substr(at_pos + 1);
+    }
+
+    auto slash_pos = rest.find('/');
+
+    if (slash_pos != std::string::npos) {
+        parts.host = rest.substr(0, slash_pos);
+        parts.path = rest.substr(slash_pos);
+    } else {
+        parts.host = rest;
+        parts.path = "/";
+    }
+    return parts;
+}
+
+static std::pair<httplib::Client, common_http_url> common_http_client(const std::string & url) {
+    common_http_url parts = common_http_parse_url(url);
+
+    if (parts.host.empty()) {
+        throw std::runtime_error("error: invalid URL format");
+    }
+
+    httplib::Client cli(parts.scheme + "://" + parts.host);
+
+    if (!parts.user.empty()) {
+        cli.set_basic_auth(parts.user, parts.password);
+    }
+
+    cli.set_follow_location(true);
+
+    return { std::move(cli), std::move(parts) };
+}
+
+static std::string common_http_show_masked_url(const common_http_url & parts) {
+    return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
+}
--- a/common/json-partial.cpp
+++ b/common/json-partial.cpp
@@ -5,6 +5,7 @@
 #include <nlohmann/json.hpp>

 #include <string>
+#include <regex>

 using json = nlohmann::ordered_json;

@@ -168,6 +169,47 @@ bool common_json_parse(
                }
            }

+            // Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX
+            static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)");
+
+            auto is_high_surrogate = [&](const std::string & s) {
+                // Check if a partial of a high surrogate (U+D800-U+DBFF)
+                return s.length() >= 4 &&
+                    s[0] == '\\' && s[1] == 'u' &&
+                    std::tolower(s[2]) == 'd' &&
+                    (s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b');
+            };
+
+            // Initialize the unicode marker to a low surrogate to handle the edge case
+            // where a high surrogate (U+D800-U+DBFF) is immediately followed by a
+            // backslash (\)
+            std::string unicode_marker_padding = "udc00";
+            std::smatch last_unicode_seq;
+
+            if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) {
+                std::smatch second_last_seq;
+                std::string prelude = str.substr(0, last_unicode_seq.position());
+
+                // Pad the escape sequence with 0s until it forms a complete sequence of 6 characters
+                unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0');
+
+                if (is_high_surrogate(last_unicode_seq.str())) {
+                    // If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+UDFF)
+                    unicode_marker_padding += "\\udc00";
+                } else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) {
+                    if (is_high_surrogate(second_last_seq.str())) {
+                        // If this follows a high surrogate, pad it to be a low surrogate
+                        if (last_unicode_seq.length() == 2) {
+                            unicode_marker_padding = "dc00";
+                        } else if (last_unicode_seq.length() == 3) {
+                            unicode_marker_padding = "c00";
+                        } else {
+                            // The original unicode_marker_padding is already padded with 0s
+                        }
+                    }
+                }
+            }
+
            const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$";

            if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) {
@@ -186,6 +228,9 @@ bool common_json_parse(
                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
                    // Was inside an object value string after an escape
                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+                } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
+                    // Was inside an object value string after a partial unicode escape
+                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
                } else {
                    // find last :
                    auto last_pos = str.find_last_of(':');
@@ -205,6 +250,9 @@ bool common_json_parse(
                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) {
                    // Was inside an array value string after an escape
                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing;
+                } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) {
+                    // Was inside an array value string after a partial unicode escape
+                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing;
                } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) {
                    // Had just finished a value
                    str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing;
@@ -230,6 +278,9 @@ bool common_json_parse(
                } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) {
                    // Was inside an object key string after an escape
                    str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing;
+                } else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) {
+                    // Was inside an object key string after a partial unicode escape
+                    str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing;
                } else {
                    auto last_pos = str.find_last_of(':');
                    if (last_pos == std::string::npos) {
@@ -246,8 +297,25 @@ bool common_json_parse(
            it = temptative_end;
            return true;
        }
-        // TODO: handle unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
-        // fprintf(stderr, "Closing: TODO\n");
+        // handle unclosed top-level primitive
+        if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) {
+            std::string str(it, temptative_end);
+            const auto & magic_seed = out.healing_marker.marker = healing_marker;
+            if (can_parse(str + "\"")) {
+                // Was inside an string
+                str += (out.healing_marker.json_dump_marker = magic_seed) + "\"";
+            } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) {
+                // Was inside an string after an escape
+                str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"";
+            } else {
+                // TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...)
+                // fprintf(stderr, "Closing: TODO\n");
+                return false;
+            }
+            out.json = json::parse(str);
+            it = temptative_end;
+            return true;
+        }
        return false;
    }
    out.json = json::parse(it, end);
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -41,9 +41,9 @@ static std::string build_repetition(const std::string & item_rule, int min_items
    return result;
 }

-static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
-    auto has_min = min_value != std::numeric_limits<int>::min();
-    auto has_max = max_value != std::numeric_limits<int>::max();
+static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+    auto has_min = min_value != std::numeric_limits<int64_t>::min();
+    auto has_max = max_value != std::numeric_limits<int64_t>::max();

    auto digit_range = [&](char from, char to) {
        out << "[";
@@ -159,7 +159,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
    if (has_min) {
        if (min_value < 0) {
            out << "\"-\" (";
-            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
            out << ") | [0] | [1-9] ";
            more_digits(0, decimals_left - 1);
        } else if (min_value == 0) {
@@ -194,7 +194,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
            }
            digit_range(c, c);
            out << " (";
-            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
+            _build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
            out << ")";
            if (c < '9') {
                out << " | ";
@@ -216,7 +216,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
            _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
        } else {
            out << "\"-\" (";
-            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
            out << ")";
        }
        return;
@@ -268,10 +268,10 @@ static bool is_reserved_name(const std::string & name) {
 }

 std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
-std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]");
+std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"\\\\]");
 std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
 std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
-    {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}
+    {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}, {'\\', "\\\\"}
 };

 std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
@@ -303,6 +303,8 @@ static std::string format_literal(const std::string & literal) {
    return "\"" + escaped + "\"";
 }

+std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
+
 class SchemaConverter {
 private:
    friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
@@ -601,7 +603,10 @@ private:
    }

    std::string _resolve_ref(const std::string & ref) {
-        std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
+        auto it = ref.find('#');
+        std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
+        static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
+        std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
        if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
            _refs_being_resolved.insert(ref);
            json resolved = _refs[ref];
@@ -774,11 +779,24 @@ public:
                        std::vector<std::string> tokens = string_split(pointer, "/");
                        for (size_t i = 1; i < tokens.size(); ++i) {
                            std::string sel = tokens[i];
-                            if (target.is_null() || !target.contains(sel)) {
+                            if (target.is_object() && target.contains(sel)) {
+                                target = target[sel];
+                            } else if (target.is_array()) {
+                                size_t sel_index;
+                                try {
+                                    sel_index = std::stoul(sel);
+                                } catch (const std::invalid_argument & e) {
+                                    sel_index = target.size();
+                                }
+                                if (sel_index >= target.size()) {
+                                    _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
+                                    return;
+                                }
+                                target = target[sel_index];
+                            } else {
                                _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
                                return;
                            }
-                            target = target[sel];
                        }
                        _refs[ref] = target;
                    }
@@ -925,17 +943,17 @@ public:
            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
        } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
-            int min_value = std::numeric_limits<int>::min();
-            int max_value = std::numeric_limits<int>::max();
+            int64_t min_value = std::numeric_limits<int64_t>::min();
+            int64_t max_value = std::numeric_limits<int64_t>::max();
            if (schema.contains("minimum")) {
-                min_value = schema["minimum"].get<int>();
+                min_value = schema["minimum"].get<int64_t>();
            } else if (schema.contains("exclusiveMinimum")) {
-                min_value = schema["exclusiveMinimum"].get<int>() + 1;
+                min_value = schema["exclusiveMinimum"].get<int64_t>() + 1;
            }
            if (schema.contains("maximum")) {
-                max_value = schema["maximum"].get<int>();
+                max_value = schema["maximum"].get<int64_t>();
            } else if (schema.contains("exclusiveMaximum")) {
-                max_value = schema["exclusiveMaximum"].get<int>() - 1;
+                max_value = schema["exclusiveMaximum"].get<int64_t>() - 1;
            }
            std::stringstream out;
            out << "(";
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@@ -18,4 +18,6 @@ struct common_grammar_options {
    bool dotall = false;
 };

+std::string gbnf_format_literal(const std::string & literal);
+
 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options = {});
--- a/common/log.cpp
+++ b/common/log.cpp
@@ -442,3 +442,9 @@ void common_log_set_prefix(struct common_log * log, bool prefix) {
 void common_log_set_timestamps(struct common_log * log, bool timestamps) {
    log->set_timestamps(timestamps);
 }
+
+void common_log_default_callback(enum ggml_log_level level, const char * text, void * /*user_data*/) {
+    if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+        common_log_add(common_log_main(), level, "%s", text);
+    }
+}
--- a/common/log.h
+++ b/common/log.h
@@ -36,6 +36,8 @@ extern int common_log_verbosity_thold;

 void common_log_set_verbosity_thold(int verbosity); // not thread-safe

+void common_log_default_callback(enum ggml_log_level level, const char * text, void * user_data);
+
 // the common_log uses an internal worker thread to print/write log messages
 // when the worker thread is paused, incoming log messages are discarded
 struct common_log;
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -3,9 +3,10 @@
 #include "common.h"
 #include "log.h"

-#include <cmath>
-#include <unordered_map>
 #include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <unordered_map>

 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -112,6 +113,13 @@ struct common_sampler {

    llama_token_data_array cur_p;

+    void reset() {
+        prev.clear();
+
+        llama_sampler_reset(grmr);
+        llama_sampler_reset(chain);
+    }
+
    void set_logits(struct llama_context * ctx, int idx) {
        const auto * logits = llama_get_logits_ith(ctx, idx);

@@ -128,6 +136,12 @@ struct common_sampler {

        cur_p = { cur.data(), cur.size(), -1, false };
    }
+
+    common_time_meas tm() {
+        return common_time_meas(t_total_us, params.no_perf);
+    }
+
+    mutable int64_t t_total_us = 0;
 };

 std::string common_params_sampling::print() const {
@@ -298,6 +312,8 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 }

 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+    const auto tm = gsmpl->tm();
+
    if (accept_grammar) {
        llama_sampler_accept(gsmpl->grmr, token);
    }
@@ -308,9 +324,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
 }

 void common_sampler_reset(struct common_sampler * gsmpl) {
-    llama_sampler_reset(gsmpl->grmr);
-
-    llama_sampler_reset(gsmpl->chain);
+    gsmpl->reset();
 }

 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
@@ -327,15 +341,54 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
    // TODO: measure grammar performance

+    const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
+
+    llama_perf_sampler_data data_smpl;
+    llama_perf_context_data data_ctx;
+
+    memset(&data_smpl, 0, sizeof(data_smpl));
+    memset(&data_ctx,  0, sizeof(data_ctx));
+
    if (gsmpl) {
-        llama_perf_sampler_print(gsmpl->chain);
+        auto & data = data_smpl;
+
+        data = llama_perf_sampler(gsmpl->chain);
+
+        // note: the sampling time includes the samplers time + extra time spent in common/sampling
+        LOG_INF("%s:    sampling time = %10.2f ms\n", __func__, t_sampling_ms);
+        LOG_INF("%s:    samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
    }
+
    if (ctx) {
-        llama_perf_context_print(ctx);
+        auto & data = data_ctx;
+
+        data = llama_perf_context(ctx);
+
+        const double t_end_ms = 1e-3 * ggml_time_us();
+
+        const double t_total_ms = t_end_ms - data.t_start_ms;
+        const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
+        const double t_unacc_pc = 100.0 * t_unacc_ms /  t_total_ms;
+
+        LOG_INF("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
+        LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+        LOG_INF("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+        LOG_INF("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+        LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %%      (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
+        LOG_INF("%s:    graphs reused = %10d\n", __func__, data.n_reused);
+
+        llama_memory_breakdown_print(ctx);
    }
 }

 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+    llama_synchronize(ctx);
+
+    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
+    const auto tm = gsmpl->tm();
+
    gsmpl->set_logits(ctx, idx);

    auto & grmr  = gsmpl->grmr;
@@ -427,6 +480,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 // helpers

 llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    const auto tm = gsmpl->tm();
+
    auto * res = &gsmpl->cur_p;

    if (do_sort && !res->sorted) {
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -139,7 +139,10 @@ models = [
    {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
    {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
    {"name": "mellum",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
-    {"name": "llada-moe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
+    {"name": "afmoe",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/arcee-ai/Trinity-Tokenizer", },
+    {"name": "bailingmoe2",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
+    {"name": "granite-docling",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
+    {"name": "minimax-m2",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -434,7 +437,7 @@ for model in models:
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
        else:
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
+    except (OSError, TypeError) as e:
        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
        continue  # Skip this model and continue with the next one in the loop

--- a/convert_lora_to_gguf.py
+++ b/convert_lora_to_gguf.py
@@ -242,7 +242,7 @@ def parse_args() -> argparse.Namespace:
        help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
    )
    parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f32",
        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
    )
    parser.add_argument(
@@ -277,10 +277,15 @@ def parse_args() -> argparse.Namespace:
    return parser.parse_args()


-def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]:
+def load_hparams_from_hf(hf_model_id: str) -> tuple[dict[str, Any], Path | None]:
+    from huggingface_hub import try_to_load_from_cache
+
    # normally, adapter does not come with base model config, we need to load it from AutoConfig
    config = AutoConfig.from_pretrained(hf_model_id)
-    return config.to_dict()
+    cache_dir = try_to_load_from_cache(hf_model_id, "config.json")
+    cache_dir = Path(cache_dir).parent if isinstance(cache_dir, str) else None
+
+    return config.to_dict(), cache_dir


 if __name__ == '__main__':
@@ -325,13 +330,13 @@ if __name__ == '__main__':
    # load base model
    if base_model_id is not None:
        logger.info(f"Loading base model from Hugging Face: {base_model_id}")
-        hparams = load_hparams_from_hf(base_model_id)
+        hparams, dir_base_model = load_hparams_from_hf(base_model_id)
    elif dir_base_model is None:
        if "base_model_name_or_path" in lparams:
            model_id = lparams["base_model_name_or_path"]
            logger.info(f"Loading base model from Hugging Face: {model_id}")
            try:
-                hparams = load_hparams_from_hf(model_id)
+                hparams, dir_base_model = load_hparams_from_hf(model_id)
            except OSError as e:
                logger.error(f"Failed to load base model config: {e}")
                logger.error("Please try downloading the base model and add its path to --base")
@@ -480,6 +485,7 @@ if __name__ == '__main__':
            dir_lora_model=dir_lora,
            lora_alpha=alpha,
            hparams=hparams,
+            remote_hf_model_id=base_model_id,
        )

        logger.info("Exporting model...")
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -313,7 +313,12 @@ Converting the matmul weight format from ND to NZ to improve performance. Enable

 ### GGML_CANN_ACL_GRAPH

-Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default.
+Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default. This option is only effective if `USE_ACL_GRAPH` was enabled at compilation time. To enable it, recompile using:
+
+```sh
+cmake -B build -DGGML_CANN=on -DCMAKE_BUILD_TYPE=release -DUSE_ACL_GRAPH=ON
+cmake --build build --config release
+```

 ### GGML_CANN_GRAPH_CACHE_CAPACITY

--- a/docs/backend/OPENCL.md
+++ b/docs/backend/OPENCL.md
@@ -39,18 +39,23 @@ The llama.cpp OpenCL backend is designed to enable llama.cpp on **Qualcomm Adren
 | Adreno 830 (Snapdragon 8 Elite)      | Support |
 | Adreno X85 (Snapdragon X Elite)      | Support |

+> A6x GPUs with a recent driver and compiler are supported; they are usually found in IoT platforms.
+However, A6x GPUs in phones are likely not supported due to the outdated driver and compiler.
+
 ## DataType Supports

 | DataType               | Status                     |
 |:----------------------:|:--------------------------:|
 | Q4_0                   | Support                    |
 | Q6_K                   | Support, but not optimized |
+| Q8_0                   | Support                    |
+| MXFP4                  | Support                    |

 ## Model Preparation

-You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration.
+You can refer to the general [llama-quantize tool](/tools/quantize/README.md) for steps to convert a model in Hugging Face safetensor format to GGUF with quantization.

-Currently we support `Q4_0` quantization and have optimize for it. To achieve best performance on Adreno GPU, add `--pure` to `llama-quantize`. For example,
+Currently we support `Q4_0` quantization and have optimized for it. To achieve best performance on Adreno GPU, add `--pure` to `llama-quantize` (i.e., make all weights in `Q4_0`). For example,

 ```sh
 ./llama-quantize --pure ggml-model-qwen2.5-3b-f16.gguf ggml-model-qwen-3b-Q4_0.gguf Q4_0
@@ -58,6 +63,17 @@ Currently we support `Q4_0` quantization and have optimize for it. To achieve be

 Since `Q6_K` is also supported, `Q4_0` quantization without `--pure` will also work. However, the performance will be worse compared to pure `Q4_0` quantization.

+### `MXFP4` MoE Models
+
+OpenAI gpt-oss models are MoE models in `MXFP4`. The quantized model will be in `MXFP4_MOE`, a mixture of `MXFP4` and `Q8_0`.
+For this quantization, there is no need to specify `--pure`.
+For gpt-oss-20b model, you can directly [download](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) the quantized GGUF file in `MXFP4_MOE` from Hugging Face.
+
+Although it is possible to quantize gpt-oss-20b model in pure `Q4_0` (all weights in `Q4_0`), it is not recommended since `MXFP4` has been optimized for MoE while `Q4_0` is not. In addition, accuracy should degrade with such pure `Q4_0` quantization.
+Hence, using the default `MXFP4_MOE` quantization (see the link above) is recommended for this model.
+
+> Note that the `Q4_0` model found [here](https://huggingface.co/unsloth/gpt-oss-20b-GGUF/blob/main/gpt-oss-20b-Q4_0.gguf) is a mixture of `Q4_0`, `Q8_0` and `MXFP4` and gives better performance than `MXFP4_MOE` quantization.
+
 ## CMake Options

 The OpenCL backend has the following CMake options that control the behavior of the backend.
@@ -146,10 +162,13 @@ A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the followi
 * Ninja
 * Visual Studio 2022
 * Powershell 7
+* Python

 Visual Studio provides necessary headers and libraries although it is not directly used for building.
 Alternatively, Visual Studio Build Tools can be installed instead of the full Visual Studio.

+> Note that building using Visual Studio's cl compiler is not supported. Clang must be used. Clang depends on libraries provided by Visual Studio to work. Therefore, Visual Studio must be installed. Alternatively, Visual Studio Build Tools can be installed instead of the full Visual Studio.
+
 Powershell 7 is used for the following commands.
 If an older version of Powershell is used, these commands may not work as they are.

@@ -201,9 +220,12 @@ ninja

 ## Known Issues

- Currently OpenCL backend does not work on Adreno 6xx GPUs.
+- Flash attention does not always improve performance.
+- Currently OpenCL backend works on A6xx GPUs with recent drivers and compilers (usually found in IoT platforms).
+  However, it does not work on A6xx GPUs found in phones with old drivers and compilers.

 ## TODO

 - Optimization for Q6_K
 - Support and optimization for Q4_K
+- Improve flash attention
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -42,6 +42,9 @@ The following releases are verified and recommended:

 ## News

+- 2025.11
+  - Support malloc memory on device more than 4GB.
+
 - 2025.2
  - Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increase the performance of LLM (llama-2-7b.Q4_0.gguf) 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC).
    |GPU|Base tokens/s|Increased tokens/s|Percent|
@@ -145,12 +148,13 @@ The docker build option is currently limited to *Intel GPU* targets.
 ```sh
 # Using FP16
 docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
+
+# Using FP32
+docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=OFF" --target light -f .devops/intel.Dockerfile .
 ```

 *Notes*:

-To build in default FP32 *(Slower than FP16 alternative)*, set `--build-arg="GGML_SYCL_F16=OFF"` in the previous command.
-
 You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
 Check the [documentation for Docker](../docker.md) to see the available images.

@@ -160,7 +164,7 @@ Check the [documentation for Docker](../docker.md) to see the available images.
 # First, find all the DRI cards
 ls -la /dev/dri
 # Then, pick the card that you want to use (here for e.g. /dev/dri/card1).
-docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+docker run -it --rm -v "/path/to/models:/models" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card0:/dev/dri/card0 llama-cpp-sycl -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -c 4096 -s 0
 ```

 *Notes:*
@@ -215,9 +219,19 @@ To target AMD GPUs with SYCL, the ROCm stack must be installed first.

 2. **Install Intel® oneAPI Base toolkit**

+SYCL backend depends on:
+  - Intel® oneAPI DPC++/C++ compiler/running-time.
+  - Intel® oneAPI DPC++/C++ library (oneDPL).
+  - Intel® oneAPI Deep Neural Network Library (oneDNN).
+  - Intel® oneAPI Math Kernel Library (oneMKL).
+
 - **For Intel GPU**

-The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
+All above are included in both **Intel® oneAPI Base toolkit** and **Intel® Deep Learning Essentials** packages.
+
+It's recommended to install **Intel® Deep Learning Essentials** which only provides the necessary libraries with less size.
+
+The **Intel® oneAPI Base toolkit** and **Intel® Deep Learning Essentials** can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.

 Please follow the instructions for downloading and installing the Toolkit for Linux, and preferably keep the default installation values unchanged, notably the installation path *(`/opt/intel/oneapi` by default)*.

@@ -225,6 +239,12 @@ Following guidelines/code snippets assume the default installation values. Other

 Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.

+|Verified release|
+|-|
+|2025.2.1|
+|2025.1|
+|2024.1|
+
 - **Adding support to Nvidia GPUs**

 **oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
@@ -255,10 +275,11 @@ sycl-ls
 When targeting an intel GPU, the user should expect one or more devices among the available SYCL devices. Please make sure that at least one GPU is present via `sycl-ls`, for instance `[level_zero:gpu]` in the sample output below:

 ```
-[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
-[opencl:cpu][opencl:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
-[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
-[level_zero:gpu][level_zero:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
+[level_zero:gpu][level_zero:0] Intel(R) oneAPI Unified Runtime over Level-Zero, Intel(R) Arc(TM) A770 Graphics 12.55.8 [1.3.29735+27]
+[level_zero:gpu][level_zero:1] Intel(R) oneAPI Unified Runtime over Level-Zero, Intel(R) UHD Graphics 730 12.2.0 [1.3.29735+27]
+[opencl:cpu][opencl:0] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i5-13400 OpenCL 3.0 (Build 0) [2025.20.8.0.06_160000]
+[opencl:gpu][opencl:1] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [24.39.31294]
+[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) UHD Graphics 730 OpenCL 3.0 NEO  [24.39.31294]
 ```

 - **Nvidia GPU**
@@ -353,7 +374,7 @@ cmake --build build --config Release -j -v

 #### Retrieve and prepare model

-You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
+You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf?download=true) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).

 ##### Check device

@@ -466,7 +487,17 @@ If you already have a recent version of Microsoft Visual Studio, you can skip th

 3. Install Intel® oneAPI Base toolkit

-The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
+SYCL backend depends on:
+  - Intel® oneAPI DPC++/C++ compiler/running-time.
+  - Intel® oneAPI DPC++/C++ library (oneDPL).
+  - Intel® oneAPI Deep Neural Network Library (oneDNN).
+  - Intel® oneAPI Math Kernel Library (oneMKL).
+
+All above are included in both **Intel® oneAPI Base toolkit** and **Intel® Deep Learning Essentials** packages.
+
+It's recommended to install **Intel® Deep Learning Essentials** which only provides the necessary libraries with less size.
+
+The **Intel® oneAPI Base toolkit** and **Intel® Deep Learning Essentials** can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.

 Please follow the instructions for downloading and installing the Toolkit for Windows, and preferably keep the default installation values unchanged, notably the installation path *(`C:\Program Files (x86)\Intel\oneAPI` by default)*.

@@ -761,6 +792,8 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
 | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. |
 | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
+| UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Support malloc device memory more than 4GB.|
+


 ## Known Issues
@@ -807,6 +840,14 @@ use 1 SYCL GPUs: [0] with Max compute units:512
  | The default context is too big. It leads to excessive memory usage.|Set `-c 8192` or a smaller value.|
  | The model is too big and requires more memory than what is available.|Choose a smaller model or change to a smaller quantization, like Q5 -> Q4;<br>Alternatively, use more than one device to load model.|

+- `ggml_backend_sycl_buffer_type_alloc_buffer: can't allocate 5000000000 Bytes of memory on device`
+
+  You need to enable to support 4GB memory malloc by:
+  ```
+    export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+    set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
+  ```
+
 ### **GitHub contribution**:
 Please add the `SYCL :` prefix/tag in issues/PRs titles to help the SYCL contributors to check/address them without delay.

--- a/docs/backend/hexagon/CMakeUserPresets.json
+++ b/docs/backend/hexagon/CMakeUserPresets.json
@@ -0,0 +1,49 @@
+{
+  "version": 4,
+  "configurePresets": [
+    {
+        "name": "arm64-android-snapdragon",
+        "hidden": true,
+        "architecture": { "value": "arm64",       "strategy": "external" },
+        "toolset":      { "value": "host=x86_64", "strategy": "external" },
+        "cacheVariables": {
+            "ANDROID_ABI":      "arm64-v8a",
+            "ANDROID_PLATFORM": "android-31",
+            "CMAKE_TOOLCHAIN_FILE": "$env{ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake",
+            "CMAKE_C_FLAGS":   "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
+            "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -fno-finite-math-only -flto -D_GNU_SOURCE",
+            "CMAKE_C_FLAGS_RELEASE":          "-O3 -DNDEBUG",
+            "CMAKE_CXX_FLAGS_RELEASE":        "-O3 -DNDEBUG",
+            "CMAKE_C_FLAGS_RELWITHDEBINFO":   "-O3 -DNDEBUG -g",
+            "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "-O3 -DNDEBUG -g",
+            "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
+            "PREBUILT_LIB_DIR": "android_aarch64",
+            "GGML_OPENMP":      "OFF",
+            "GGML_LLAMAFILE":   "OFF",
+            "GGML_OPENCL":      "ON",
+            "GGML_HEXAGON":     "ON",
+            "LLAMA_CURL":       "OFF"
+        }
+    },
+
+    {
+        "name": "arm64-windows-snapdragon",
+        "inherits": [ "base", "arm64-windows-llvm" ],
+        "cacheVariables": {
+            "HEXAGON_SDK_ROOT": "$env{HEXAGON_SDK_ROOT}",
+            "PREBUILT_LIB_DIR": "windows_aarch64",
+            "GGML_OPENMP":      "OFF",
+            "GGML_LLAMAFILE":   "OFF",
+            "GGML_OPENCL":      "ON",
+            "GGML_HEXAGON":     "ON",
+            "LLAMA_CURL":       "OFF"
+        }
+    },
+
+    { "name": "arm64-android-snapdragon-debug"  , "inherits": [ "base", "arm64-android-snapdragon", "debug" ] },
+    { "name": "arm64-android-snapdragon-release", "inherits": [ "base", "arm64-android-snapdragon", "release" ] },
+
+    { "name": "arm64-windows-snapdragon-debug"  , "inherits": [ "base", "arm64-windows-snapdragon", "debug" ] },
+    { "name": "arm64-windows-snapdragon-release", "inherits": [ "base", "arm64-windows-snapdragon", "release" ] }
+  ]
+}
--- a/docs/backend/hexagon/README.md
+++ b/docs/backend/hexagon/README.md
@@ -0,0 +1,239 @@
+# Snapdragon-based Android devices
+
+## How to Build
+
+The easiest way to build llama.cpp for a Snapdragon-based Android device is using the toolchain Docker image (see github.com/snapdragon-toolchain).
+This image includes Android NDK, OpenCL SDK, Hexagon SDK, CMake, etc.
+
+This method works on Linux, macOS, and Windows. macOS and Windows users should install Docker Desktop.
+
+```
+~/src/llama.cpp$ docker run -it -u $(id -u):$(id -g) --volume $(pwd):/workspace --platform linux/amd64 ghcr.io/snapdragon-toolchain/arm64-android:v0.3
+[d]/> cd /workspace
+```
+
+The rest of the Android build process assumes that you're running inside the toolchain container.
+Let's build llama.cpp with CPU, OpenCL, and Hexagon backends via CMake presets:
+
+```
+[d]/workspace> cp docs/backend/hexagon/CMakeUserPresets.json .
+
+[d]/workspace> cmake --preset arm64-android-snapdragon-release -B build-snapdragon
+Preset CMake variables:
+  ANDROID_ABI="arm64-v8a"
+  ...
+  CMAKE_TOOLCHAIN_FILE="/opt/android-ndk-r28b/build/cmake/android.toolchain.cmake"
+  GGML_HEXAGON="ON"
+  GGML_OPENCL="ON"
+  GGML_OPENMP="OFF"
+  HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2"
+...
+-- Including OpenCL backend
+-- Including Hexagon backend
+...
+-- Build files have been written to: /workspace/build-snapdragon
+
+[d]/workspace> cmake --build build-snapdragon
+...
+[144/356] Performing build step for 'htp-v73'
+[1/16] Generating htp_iface_skel.c, htp_iface_stub.c, htp_iface.h
+[2/16] Building C object CMakeFiles/ggml-htp-v73.dir/hvx-sigmoid.c.obj
+[3/16] Building C object CMakeFiles/ggml-htp-v73.dir/htp-dma.c.obj
+[4/16] Building C object CMakeFiles/ggml-htp-v73.dir/worker-pool.c.obj
+...
+-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v73.so
+-- Installing: /workspace/build-snapdragon/ggml/src/ggml-hexagon/libggml-htp-v75.so
+...
+```
+
+To generate an installable "package" simply use cmake --install:
+
+```
+[d]/workspace> cmake --install build-snapdragon --prefix pkg-adb/llama.cpp
+-- Install configuration: "Release"
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-cpu.so
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-opencl.so
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-hexagon.so
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v73.so
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v75.so
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v79.so
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml-htp-v81.so
+-- Installing: /workspace/pkg-adb/llama.cpp/lib/libggml.so
+...
+-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-bench
+-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-cli
+...
+```
+
+## How to Install
+
+For this step, your device needs to be configured for on-device development.
+Please see https://developer.android.com/studio/debug/dev-options for details.
+
+Once ADB is enabled, use `adb push` to install `pkg-snapdragon` on the device.
+**Note that the toolchain Docker image doesn't have ADB and doesn't set up the ADB bridge. Please use native ADB on the host.**
+
+```
+~/src/llama.cpp$ adb push pkg-adb/llama.cpp /data/local/tmp/
+pkg-adb/llama.cpp/bin/: 67 files pushed, 0 skipped. 190.2 MB/s (919095042 bytes in 4.607s)
+pkg-adb/llama.cpp/include/: 19 files pushed, 0 skipped. 20.5 MB/s (255173 bytes in 0.012s)
+pkg-adb/llama.cpp/lib/: 16 files pushed, 0 skipped. 144.4 MB/s (43801382 bytes in 0.289s)
+102 files pushed, 0 skipped. 186.9 MB/s (963151597 bytes in 4.914s)
+```
+
+At this point, you should also install some models:
+
+```
+~/src/llama.cpp$ wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf
+...
+2025-10-11 12:04:52 (10.7 MB/s) - ‘Llama-3.2-1B-Instruct-Q4_0.gguf’ saved [773025920/773025920]
+
+~/src/llama.cpp$ adb push Llama-3.2-1B-Instruct-Q4_0.gguf /data/local/tmp/gguf
+Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920 bytes in 19.250s)
+```
+
+## How to Run
+
+The easiest way to run llama.cpp cli tools is using provided wrapper scripts that properly set up all required environment variables.
+
+llama.cpp supports three backends on Snapdragon-based devices: CPU, Adreno GPU (GPUOpenCL), and Hexagon NPU (HTP0-4).
+You can select which backend to run the model on using the `D=` variable, which maps to the `--device` option.
+
+Hexagon NPU behaves as a "GPU" device when it comes to `-ngl` and other offload-related options.
+
+Here are some examples of running various llama.cpp tools via ADB.
+
+Simple question for Llama-3.2-1B
+
+```
+~/src/llama.cpp$ M=Llama-3.2-1B-Instruct-Q4_0.gguf D=HTP0 ./scripts/snapdragon/adb/run-cli.sh -no-cnv -p "what is the most popular cookie in the world?"
+...
+ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
+ggml-hex: Hexagon Arch version v79
+ggml-hex: allocating new session: HTP0
+ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb4000072c7955e50
+...
+load_tensors: offloading output layer to GPU
+load_tensors: offloaded 17/17 layers to GPU
+load_tensors:          CPU model buffer size =   225.49 MiB
+load_tensors:         HTP0 model buffer size =     0.26 MiB
+load_tensors:  HTP0-REPACK model buffer size =   504.00 MiB
+...
+I hope this helps you understand the world's most popular cookies! [end of text]
+...
+llama_perf_sampler_print:    sampling time =      30.08 ms /   487 runs   (    0.06 ms per token, 16191.77 tokens per second)
+llama_perf_context_print:        load time =     617.94 ms
+llama_perf_context_print: prompt eval time =      80.76 ms /    11 tokens (    7.34 ms per token,   136.21 tokens per second)
+llama_perf_context_print:        eval time =    9210.59 ms /   475 runs   (   19.39 ms per token,    51.57 tokens per second)
+llama_perf_context_print:       total time =    9454.92 ms /   486 tokens
+llama_perf_context_print:    graphs reused =        473
+llama_memory_breakdown_print: | memory breakdown [MiB] | total   free    self   model   context   compute    unaccounted |
+llama_memory_breakdown_print: |   - HTP0 (Hexagon)     |  2048 = 2048 + (   0 =     0 +       0 +       0) +           0 |
+llama_memory_breakdown_print: |   - Host               |                  439 =   225 +     136 +      77                |
+llama_memory_breakdown_print: |   - HTP0-REPACK        |                  504 =   504 +       0 +       0                |
+```
+
+Summary request for OLMoE-1B-7B. This is a large model that requires two HTP sessions/devices
+
+```
+~/src/llama.cpp$ M=OLMoE-1B-7B-0125-Instruct-Q4_0.gguf NDEV=2 D=HTP0,HTP1 ./scripts/snapdragon/adb/run-cli.sh -f surfing.txt -no-cnv
+...
+ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
+ggml-hex: Hexagon Arch version v81
+ggml-hex: allocating new session: HTP0
+ggml-hex: allocating new session: HTP1
+...
+load_tensors: offloading output layer to GPU
+load_tensors: offloaded 17/17 layers to GPU
+load_tensors:          CPU model buffer size =   143.86 MiB
+load_tensors:         HTP1 model buffer size =     0.23 MiB
+load_tensors:  HTP1-REPACK model buffer size =  1575.00 MiB
+load_tensors:         HTP0 model buffer size =     0.28 MiB
+load_tensors:  HTP0-REPACK model buffer size =  2025.00 MiB
+...
+llama_context:        CPU  output buffer size =     0.19 MiB
+llama_kv_cache:       HTP1 KV buffer size =   238.00 MiB
+llama_kv_cache:       HTP0 KV buffer size =   306.00 MiB
+llama_kv_cache: size =  544.00 MiB (  8192 cells,  16 layers,  1/1 seqs), K (q8_0):  272.00 MiB, V (q8_0):  272.00 MiB
+llama_context:       HTP0 compute buffer size =    15.00 MiB
+llama_context:       HTP1 compute buffer size =    15.00 MiB
+llama_context:        CPU compute buffer size =    24.56 MiB
+...
+llama_perf_context_print: prompt eval time =    1730.57 ms /   212 tokens (    8.16 ms per token,   122.50 tokens per second)
+llama_perf_context_print:        eval time =    5624.75 ms /   257 runs   (   21.89 ms per token,    45.69 tokens per second)
+llama_perf_context_print:       total time =    7377.33 ms /   469 tokens
+llama_perf_context_print:    graphs reused =        255
+llama_memory_breakdown_print: | memory breakdown [MiB] | total   free    self   model   context   compute    unaccounted |
+llama_memory_breakdown_print: |   - HTP0 (Hexagon)     |  2048 = 2048 + (   0 =     0 +       0 +       0) +           0 |
+llama_memory_breakdown_print: |   - HTP1 (Hexagon)     |  2048 = 2048 + (   0 =     0 +       0 +       0) +           0 |
+llama_memory_breakdown_print: |   - Host               |                  742 =   144 +     544 +      54                |
+llama_memory_breakdown_print: |   - HTP1-REPACK        |                 1575 =  1575 +       0 +       0                |
+llama_memory_breakdown_print: |   - HTP0-REPACK        |                 2025 =  2025 +       0 +       0                |
+```
+
+Op test for MUL_MAT
+
+```
+~/src/llama.cpp$ HB=0 ./scripts/snapdragon/adb/run-tool.sh test-backend-ops -b HTP0 -o MUL_MAT
+...
+Backend 2/3: HTP0
+Device description: Hexagon
+Device memory: 2048 MB (2048 MB free)
+MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
+MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
+MUL_MAT(type_a=q4_0,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1): OK
+
+~/src/llama.cpp-hexagon$ M=Llama-3.2-1B-Instruct-Q4_0.gguf ./scripts/snapdragon/adb/run-bench.sh -p 128 -n 64
+...
+ggml-hex: Hexagon backend (experimental) : allocating new registry : ndev 1
+ggml-hex: Hexagon Arch version v79
+ggml-hex: allocating new session: HTP0
+ggml-hex: new session: HTP0 : session-id 0 domain-id 3 uri file:///libggml-htp-v79.so?htp_iface_skel_handle_invoke&_modver=1.0&_dom=cdsp&_session=0 handle 0xb400007d4b231090
+| model          |       size | params | backend    | ngl | threads | n_batch | mmap |  test |           t/s |
+| ---------------| ---------: | -----: | ---------- | --: | ------: | ------: | ---: | ----: | ------------: |
+| llama 1B Q4_0  | 729.75 MiB | 1.24 B | HTP        |  99 |       4 |     128 |    0 | pp128 | 169.42 ± 1.75 |
+| llama 1B Q4_0  | 729.75 MiB | 1.24 B | HTP        |  99 |       4 |     128 |    0 |  tg64 |  51.54 ± 1.13 |
+
+build: 6a8cf8914 (6733)
+```
+
+## Environment variables
+
+- `GGML_HEXAGON_NDEV=1`
+  Controls the number of devices/sessions to allocate. The default is 1.
+  Most quantized models under 4B fit into a single session; an 8B model needs two, and a 20B model needs four.
+
+- `GGML_HEXAGON_NHVX=0`
+  Controls the number of HVX hardware threads to use. The default is all (actual number varies depending on the hardware version).
+
+- `GGML_HEXAGON_HOSTBUF=1`
+  Controls whether the Hexagon backend allocates host buffers. By default, all buffers except for REPACK are host buffers.
+  This option is required for testing Ops that require REPACK buffers (MUL_MAT and MUL_MAT_ID).
+
+- `GGML_HEXAGON_VERBOSE=1`
+  Enables verbose logging of Ops from the backend. Example output:
+
+  ```
+  ggml-hex: HTP0 graph-compute n_nodes 2
+  ggml-hex: HTP0 matmul : blk.27.ffn_up.weight x ffn_norm-27 -> ffn_up-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x1
+  ggml-hex: HTP0 matmul : blk.27.ffn_gate.weight x ffn_norm-27 -> ffn_gate-27 : 3072:8192 x 3072:1 -> 8192:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x3
+  ggml-hex: HTP0 graph-compute n_nodes 1
+  ggml-hex: HTP0 matmul : blk.27.ffn_down.weight x ffn_gate_par-27 -> ffn_out-27 : 8192:3072 x 8192:1 -> 3072:1 : q4_0 x f32 -> f32 : HTP0 x HTP0 -> HTP0 : flags 0x0
+  ggml-hex: HTP0 get-tensor result_output : data 0x7592487000 offset 0 size 513024
+  ```
+
+- `GGML_HEXAGON_PROFILE=1`
+  Generates a host-side profile for the ggml-hexagon Ops.
+
+- `GGML_HEXAGON_OPMASK=0x0`
+  Allows enabling specific stages of the processing pipeline:
+
+  - `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
+  - `0x2` Enable Dynamic Quantizer (if needed for the Op)
+  - `0x4` Enable Op Compute (MUL_MAT, etc.)
+
+  Examples:
+
+      `GGML_HEXAGON_OPMASK=0x1 llama-cli ...` - Ops are enqueued but NPU-side processing is stubbed out
+      `GGML_HEXAGON_OPMASK=0x3 llama-cli ...` - NPU performs dynamic quantization and skips the rest
+      `GGML_HEXAGON_OPMASK=0x7 llama-cli ...` - Full queuing and processing of Ops (default)
--- a/docs/backend/hexagon/developer.md
+++ b/docs/backend/hexagon/developer.md
@@ -0,0 +1,109 @@
+# Hexagon backend developer details
+
+## Backend libraries
+
+The Hexagon backend consist of two parts:
+
+  - `libggml-hexagon`
+    This is the regular CPU-side GGML backend library, either shared or statically linked
+
+  - `libggml-htp-vNN`
+    This is the NPU-side (HTP stands for Hexagon Tensor Processor) shared library that contains the Op dispatcher and kernels.
+    The correct library is selected automatically at runtime based on the HW version.
+
+Here is an example of the build artifacts
+
+```
+~/src/llama.cpp$ ls -l pkg-adb/llama.cpp/lib/libggml*
+pkg-adb/llama.cpp/lib/libggml-base.so
+pkg-adb/llama.cpp/lib/libggml-cpu.so
+pkg-adb/llama.cpp/lib/libggml-hexagon.so      <<< CPU library
+pkg-adb/llama.cpp/lib/libggml-htp-v73.so      <<< HTP op/kernels for Hexagon v73
+pkg-adb/llama.cpp/lib/libggml-htp-v75.so
+pkg-adb/llama.cpp/lib/libggml-htp-v79.so
+pkg-adb/llama.cpp/lib/libggml-htp-v81.so
+```
+
+## Memory buffers
+
+Hexagon NPU backend takes advantage of the Snapdragon's unified memory model where all buffers are fully accessible by the CPU and GPU.
+The NPU does have a dedicated tightly-coupled memory called VTCM but that memory is used only for intermediate data (e.g. dynamically
+quantized tensors) or temporary data (chunks of the weight tensors fetched via DMA).
+
+Please note that currently the Hexagon backend does not implement SET/GET_ROWS Ops because there is no advantage in offloading those
+to the NPU at this point.
+
+The backend does allocates non-host buffers for the tensors with datatypes that require repacking: Q4_0, Q8_0, MXFP4.
+From the MMU perspective these buffers are still regular buffers (normal access by the CPU) they are marked as non-host simply to force
+the repacking.
+
+## Large model handling
+
+Hexagon NPU session (aka Process Domain (PD) in the Hexagon docs) is limited to a memory mapping of around 3.5GB.
+In llama.cpp/GGML the Hexagon session is mapped to a single GGML backend device (HTP0, HTP1, etc).
+
+In order to map models larger than 3.5GB we need to allocate multiple devices and split the model.
+For this we're taking advantage of the llama.cpp/GGML multi-GPU layer-splitting support.
+Each Hexagon device behaves like a GPU from the offload and model splitting perspective.
+
+Here is an example of running GPT-OSS-20B model on a newer Snapdragon device with 16GB of DDR.
+
+```
+M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapdragon/adb/run-cli.sh -no-cnv -f surfing.txt -n 32
+...
+LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
+ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
+GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf
+      -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt
+...
+llama_model_loader: - type  f32:  289 tensors
+llama_model_loader: - type q4_0:   96 tensors
+llama_model_loader: - type q8_0:    2 tensors
+llama_model_loader: - type mxfp4:  72 tensors
+...
+load_tensors: offloaded 25/25 layers to GPU
+load_tensors:          CPU model buffer size =  1182.09 MiB
+load_tensors:         HTP1 model buffer size =     6.64 MiB
+load_tensors:  HTP1-REPACK model buffer size =  2505.94 MiB
+load_tensors:         HTP3 model buffer size =     5.55 MiB
+load_tensors:  HTP3-REPACK model buffer size =  2088.28 MiB
+load_tensors:         HTP0 model buffer size =     7.75 MiB
+load_tensors:  HTP0-REPACK model buffer size =  2923.59 MiB
+load_tensors:         HTP2 model buffer size =     6.64 MiB
+load_tensors:  HTP2-REPACK model buffer size =  2505.94 MiB
+...
+llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
+llama_context:        CPU  output buffer size =     0.77 MiB
+llama_kv_cache_iswa: creating non-SWA KV cache, size = 8192 cells
+llama_kv_cache:       HTP1 KV buffer size =    25.50 MiB
+llama_kv_cache:       HTP3 KV buffer size =    25.50 MiB
+llama_kv_cache:       HTP0 KV buffer size =    25.50 MiB
+llama_kv_cache:       HTP2 KV buffer size =    25.50 MiB
+llama_kv_cache: size =  102.00 MiB (  8192 cells,  12 layers,  1/1 seqs), K (q8_0):   51.00 MiB, V (q8_0):   51.00 MiB
+llama_kv_cache_iswa: creating     SWA KV cache, size = 256 cells
+llama_kv_cache:       HTP1 KV buffer size =     0.80 MiB
+llama_kv_cache:       HTP3 KV buffer size =     0.53 MiB
+llama_kv_cache:       HTP0 KV buffer size =     1.06 MiB
+llama_kv_cache:       HTP2 KV buffer size =     0.80 MiB
+llama_kv_cache: size =    3.19 MiB (   256 cells,  12 layers,  1/1 seqs), K (q8_0):    1.59 MiB, V (q8_0):    1.59 MiB
+llama_context:       HTP0 compute buffer size =    16.06 MiB
+llama_context:       HTP1 compute buffer size =    16.06 MiB
+llama_context:       HTP2 compute buffer size =    16.06 MiB
+llama_context:       HTP3 compute buffer size =    16.06 MiB
+llama_context:        CPU compute buffer size =    98.19 MiB
+...
+llama_perf_context_print: prompt eval time =    3843.67 ms /   197 tokens ( 19.51 ms per token, 51.25 tokens per second)
+llama_perf_context_print:        eval time =    1686.13 ms /    31 runs   ( 54.39 ms per token, 18.39 tokens per second)
+llama_perf_context_print:       total time =    6266.30 ms /   228 tokens
+llama_perf_context_print:    graphs reused =         30
+llama_memory_breakdown_print: | memory breakdown [MiB] | total   free    self   model   context   compute    unaccounted |
+llama_memory_breakdown_print: |   - HTP0 (Hexagon)     |  2048 = 2048 + (   0 =     0 +       0 +       0) +           0 |
+llama_memory_breakdown_print: |   - HTP1 (Hexagon)     |  2048 = 2048 + (   0 =     0 +       0 +       0) +           0 |
+llama_memory_breakdown_print: |   - HTP2 (Hexagon)     |  2048 = 2048 + (   0 =     0 +       0 +       0) +           0 |
+llama_memory_breakdown_print: |   - HTP3 (Hexagon)     |  2048 = 2048 + (   0 =     0 +       0 +       0) +           0 |
+llama_memory_breakdown_print: |   - Host               |                 1476 =  1208 +     105 +     162                |
+llama_memory_breakdown_print: |   - HTP1-REPACK        |                 2505 =  2505 +       0 +       0                |
+llama_memory_breakdown_print: |   - HTP3-REPACK        |                 2088 =  2088 +       0 +       0                |
+llama_memory_breakdown_print: |   - HTP0-REPACK        |                 2923 =  2923 +       0 +       0                |
+llama_memory_breakdown_print: |   - HTP2-REPACK        |                 2505 =  2505 +       0 +       0                |
+```
--- a/docs/backend/zDNN.md
+++ b/docs/backend/zDNN.md
@@ -0,0 +1,61 @@
+# llama.cpp for IBM zDNN Accelerator
+
+## Background
+
+IBM zDNN (Z Deep Neural Network) is a hardware acceleration library designed specifically to leverage the IBM NNPA (Neural Network Processor Assist) accelerator located within IBM Telum I and II processors. It provides significant performance improvements for neural network inference operations.
+
+### Llama.cpp + IBM zDNN
+
+The llama.cpp zDNN backend is designed to enable llama.cpp on IBM z17 and later systems via the IBM zDNN hardware acceleration library.
+
+## Software & Hardware Support
+
+| Hardware Level       | Status        | Verified                   |
+| -------------------- | ------------- | -------------------------- |
+| IBM z17 / LinuxONE 5 | Supported     | RHEL 9.6, IBM z17, 40 IFLs |
+| IBM z16 / LinuxONE 4 | Not Supported |                            |
+
+## Data Types Supported
+
+| Data Type | Status    |
+| --------- | --------- |
+| F32       | Supported |
+| F16       | Supported |
+| BF16      | Supported |
+
+## CMake Options
+
+The IBM zDNN backend has the following CMake options that control the behaviour of the backend.
+
+| CMake Option | Default Value | Description                         |
+| ------------ | ------------- | ----------------------------------- |
+| `GGML_ZDNN`  | `OFF`         | Compile llama.cpp with zDNN support |
+| `ZDNN_ROOT`  | `""`          | Override zDNN library lookup        |
+
+## 1. Install zDNN Library
+
+Note: Using the zDNN library provided via `apt` or `yum` may not work correctly as reported in [#15772](https://github.com/ggml-org/llama.cpp/issues/15772). It is preferred that you compile from source.
+
+```sh
+git clone --recurse-submodules https://github.com/IBM/zDNN
+cd zDNN
+
+autoreconf .
+./configure --prefix=/opt/zdnn-libs
+
+make build
+sudo make install
+```
+
+## 2. Build llama.cpp
+
+```sh
+git clone https://github.com/ggml-org/llama.cpp
+cd llama.cpp
+
+cmake -S . -G Ninja -B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DGGML_ZDNN=ON \
+    -DZDNN_ROOT=/opt/zdnn-libs
+cmake --build build --config Release -j$(nproc)
+```
--- a/docs/build-riscv64-spacemit.md
+++ b/docs/build-riscv64-spacemit.md
@@ -0,0 +1,89 @@
+> [!IMPORTANT]
+> This build documentation is specific only to RISC-V SpacemiT SOCs.
+
+## Build llama.cpp locally (for riscv64)
+
+1. Prepare Toolchain For RISCV
+~~~
+wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v1.1.2.tar.xz
+~~~
+
+2. Build
+Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization version is `RISCV64_SPACEMIT_IME1`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
+```bash
+
+cmake -B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DGGML_CPU_RISCV64_SPACEMIT=ON \
+    -DLLAMA_CURL=OFF \
+    -DGGML_RVV=ON \
+    -DGGML_RV_ZFH=ON \
+    -DGGML_RV_ZICBOP=ON \
+    -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
+    -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
+    -DCMAKE_INSTALL_PREFIX=build/installed
+
+cmake --build build --parallel $(nproc) --config Release
+
+pushd build
+make install
+popd
+```
+
+## Simulation
+You can use QEMU to perform emulation on non-RISC-V architectures.
+
+1. Download QEMU
+~~~
+wget https://archive.spacemit.com/spacemit-ai/qemu/jdsk-qemu-v0.0.14.tar.gz
+~~~
+
+2. Run Simulation
+After build your llama.cpp, you can run the executable file via QEMU for simulation, for example:
+~~~
+export QEMU_ROOT_PATH={your QEMU file path}
+export RISCV_ROOT_PATH_IME1={your RISC-V compiler path}
+
+${QEMU_ROOT_PATH}/bin/qemu-riscv64 -L ${RISCV_ROOT_PATH_IME1}/sysroot -cpu max,vlen=256,elen=64,vext_spec=v1.0 ${PWD}/build/bin/llama-cli -m ${PWD}/models/Qwen2.5-0.5B-Instruct-Q4_0.gguf -t 1
+~~~
+## Performance
+#### Quantization Support For Matrix
+~~~
+model name      : Spacemit(R) X60
+isa             : rv64imafdcv_zicbom_zicboz_zicntr_zicond_zicsr_zifencei_zihintpause_zihpm_zfh_zfhmin_zca_zcd_zba_zbb_zbc_zbs_zkt_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkt_sscofpmf_sstc_svinval_svnapot_svpbmt
+mmu             : sv39
+uarch           : spacemit,x60
+mvendorid       : 0x710
+marchid         : 0x8000000058000001
+~~~
+
+Q4_0
+|   Model    |   Size   | Params | backend | threads | test | t/s |
+| -----------| -------- | ------ | ------- | ------- | ---- |------|
+Qwen2.5 0.5B |403.20 MiB|630.17 M|   cpu   |    4    | pp512|64.12 ± 0.26|
+Qwen2.5 0.5B |403.20 MiB|630.17 M|   cpu   |    4    | tg128|10.03 ± 0.01|
+Qwen2.5 1.5B |1011.16 MiB| 1.78 B |   cpu   |    4    | pp512|24.16 ± 0.02|
+Qwen2.5 1.5B |1011.16 MiB| 1.78 B |   cpu   |    4    | tg128|3.83 ± 0.06|
+Qwen2.5 3B   | 1.86 GiB  | 3.40 B |   cpu   |    4    | pp512|12.08 ± 0.02|
+Qwen2.5 3B   | 1.86 GiB  | 3.40 B |   cpu   |    4    | tg128|2.23 ± 0.02|
+
+Q4_1
+|   Model    |   Size   | Params | backend | threads | test | t/s |
+| -----------| -------- | ------ | ------- | ------- | ---- |------|
+Qwen2.5 0.5B |351.50 MiB|494.03 M|   cpu   |    4    | pp512|62.07 ± 0.12|
+Qwen2.5 0.5B |351.50 MiB|494.03 M|   cpu   |    4    | tg128|9.91 ± 0.01|
+Qwen2.5 1.5B |964.06 MiB| 1.54 B |   cpu   |    4    | pp512|22.95 ± 0.25|
+Qwen2.5 1.5B |964.06 MiB| 1.54 B |   cpu   |    4    | tg128|4.01 ± 0.15|
+Qwen2.5 3B   | 1.85 GiB | 3.09 B |   cpu   |    4    | pp512|11.55 ± 0.16|
+Qwen2.5 3B   | 1.85 GiB | 3.09 B |   cpu   |    4    | tg128|2.25 ± 0.04|
+
+
+Q4_K
+|   Model    |   Size   | Params | backend | threads | test | t/s |
+| -----------| -------- | ------ | ------- | ------- | ---- |------|
+Qwen2.5 0.5B |462.96 MiB|630.17 M|   cpu   |    4    | pp512|9.29 ± 0.05|
+Qwen2.5 0.5B |462.96 MiB|630.17 M|   cpu   |    4    | tg128|5.67 ± 0.04|
+Qwen2.5 1.5B | 1.04 GiB | 1.78 B |   cpu   |    4    | pp512|10.38 ± 0.10|
+Qwen2.5 1.5B | 1.04 GiB | 1.78 B |   cpu   |    4    | tg128|3.17 ± 0.08|
+Qwen2.5 3B   | 1.95 GiB | 3.40 B |   cpu   |    4    | pp512|4.23 ± 0.04|
+Qwen2.5 3B   | 1.95 GiB | 3.40 B |   cpu   |    4    | tg128|1.73 ± 0.00|
--- a/docs/build.md
+++ b/docs/build.md
@@ -178,6 +178,48 @@ GeForce RTX 3070      8.6
 cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89"
 ```

+### Overriding the CUDA Version
+
+If you have multiple CUDA installations on your system and want to compile llama.cpp for a specific one, e.g. for CUDA 11.7 installed under `/opt/cuda-11.7`:
+
+```bash
+cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/opt/cuda-11.7/bin/nvcc -DCMAKE_INSTALL_RPATH="/opt/cuda-11.7/lib64;\$ORIGIN" -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
+```
+
+#### Fixing Compatibility Issues with Old CUDA and New glibc
+
+If you try to use an old CUDA version (e.g. v11.7) with a new glibc version you can get errors like this:
+
+```
+/usr/include/bits/mathcalls.h(83): error: exception specification is
+  incompatible with that of previous function "cospi"
+
+
+  /opt/cuda-11.7/bin/../targets/x86_64-linux/include/crt/math_functions.h(5545):
+  here
+```
+
+It seems the least bad solution is to patch the CUDA installation to declare the correct signatures.
+Replace the following lines in `/path/to/your/cuda/installation/targets/x86_64-linux/include/crt/math_functions.h`:
+
+```C++
+// original lines
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 cospi(double x);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  cospif(float x);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 sinpi(double x);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sinpif(float x);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 rsqrt(double x);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  rsqrtf(float x);
+
+// edited lines
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 cospi(double x) noexcept (true);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  cospif(float x) noexcept (true);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 sinpi(double x) noexcept (true);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  sinpif(float x) noexcept (true);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double                 rsqrt(double x) noexcept (true);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float                  rsqrtf(float x) noexcept (true);
+```
+
 ### Runtime CUDA environmental variables

 You may set the [cuda environmental variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime.
@@ -261,10 +303,12 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
  ```bash
  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      cmake -S . -B build -DGGML_HIP=ON -DGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
      && cmake --build build --config Release -- -j 16
  ```

+  Note: `GPU_TARGETS` is optional, omitting it will build the code for all GPUs in the current system.
+
  To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.

  The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager.
@@ -282,17 +326,17 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  ```bash
  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
  HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
-      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+      cmake -S . -B build -DGGML_HIP=ON -DGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
      && cmake --build build -- -j 16
  ```

 - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
  ```bash
  set PATH=%HIP_PATH%\bin;%PATH%
-  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
+  cmake -S . -B build -G Ninja -DGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
  cmake --build build
  ```
-  Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
+  If necessary, adapt `GPU_TARGETS` to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
  Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.


--- a/docs/docker.md
+++ b/docs/docker.md
@@ -7,9 +7,9 @@
 ## Images
 We have three Docker images available for this project:

-1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
-2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
+1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
+3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)

 Additionally, there the following images, similar to the above:

@@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment

 The defaults are:

- `MUSA_VERSION` set to `rc4.2.0`
+- `MUSA_VERSION` set to `rc4.3.0`

 The resulting images, are essentially the same as the non-MUSA images:

--- a/docs/ops.md
+++ b/docs/ops.md
@@ -14,97 +14,108 @@ Legend:

 | Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | zDNN |
 |-----------|------|------|------|------|------|------|------|------|------|
-|                              ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
+|                              ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
 |                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
-|                           ADD_ID | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
+|                           ADD_ID | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
-|                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ |
+|                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ❌ |
+|                             CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
+|                            CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
+|                           CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ |
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
-|                          CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
+|                          CONV_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ |
 |                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                          CONV_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                          CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
-|                      COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
+|                      COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
 |                              CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
 |               CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |          CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                           CUMSUM | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                    DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
 |                              DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
 |                              DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
-|                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
-|                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
+|                              ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
+|                              EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+|                            EXPM1 | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                             FILL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
 |                   FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
+|                            FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
 |                GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
 |                            GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
 |                        GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
 |                      GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                             GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
+|                             GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+|                         GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+|                       GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
 |                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
 |                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |               GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
-|                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
+|                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+|                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
 |                           IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
-|                        IM2COL_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                        IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
 |                          L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
-|                              LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
-|                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ❌ |
+|                              LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ |
+|                             MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                              MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
 |                          MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
 |                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
-|                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
+|                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
 |                             NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
 |                     NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                     OPT_STEP_SGD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                     OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
 |                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
-|                              PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                              PAD | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
+|                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ |
 |                          POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                            REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                             RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
+|                             RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
 |                           REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
-|                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                      REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
 |                         RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
-|                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                             ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                    RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
+|                 RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
+|                             ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
 |                             ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                            ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
 |                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|                              SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
+|                              SET | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ❌ | ❌ |
 |                         SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
-|                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
-|                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
+|                              SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
+|                          SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
+|                             SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
 |                        SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
+|                              SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
 |                          SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ |
-|                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
-|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ | ❌ |
-|                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
-|                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
+|                         SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ |
+|                         SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+|                    SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ |
+|                        SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                              SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
+|                             SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
+|                         SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
+|                         SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ |
+|                             STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
 |                              SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
-|                              SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
-|                         SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+|                              SUM | ❌ | ✅ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
+|                         SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
 |                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                       SWIGLU_OAI | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
-|                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
+|                       SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | 🟡 | ❌ |
+|                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ❌ |
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
+|                              TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
+|                            TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
 |                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
+|                            XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
--- a/docs/ops/CPU.csv
+++ b/docs/ops/CPU.csv
--- a/docs/ops/CUDA.csv
+++ b/docs/ops/CUDA.csv
--- a/docs/ops/SYCL.csv
+++ b/docs/ops/SYCL.csv
--- a/docs/ops/Vulkan.csv
+++ b/docs/ops/Vulkan.csv
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,7 +20,6 @@ else()

    add_subdirectory(gguf-hash)
    add_subdirectory(gguf)
-    add_subdirectory(gritlm)
    add_subdirectory(lookahead)
    add_subdirectory(lookup)
    add_subdirectory(parallel)
--- a/examples/Miku.sh
+++ b/examples/Miku.sh
@@ -1,50 +0,0 @@
-#!/usr/bin/env bash
-set -e
-
-AI_NAME="${AI_NAME:-Miku}"
-MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
-USER_NAME="${USER_NAME:-Anon}"
-
-# Uncomment and adjust to the number of CPU cores you want to use.
-#N_THREAD="${N_THREAD:-4}"
-CTX_SIZE="${CTX_SIZE:-4096}"
-N_PREDICTS="${N_PREDICTS:-4096}"
-
-GEN_OPTIONS=(--batch_size 1024
--ctx_size "$CTX_SIZE"
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647
--temp 0.6
--mirostat 2)
-
-if [ -n "$N_THREAD" ]; then
-    GEN_OPTIONS+=(--threads "$N_THREAD")
-fi
-
-./llama-cli "${GEN_OPTIONS[@]}" \
-    --model "$MODEL" \
-    --in-prefix " " \
-    --in-suffix "${AI_NAME}:" \
-    --n_predict "$N_PREDICTS" \
-    --color --interactive \
-    --reverse-prompt "${USER_NAME}:" \
-    --prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
-${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
-${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
-${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
-${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
-The conversation is only between ${USER_NAME} and ${AI_NAME}.
-The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
-${AI_NAME} can only communicate through text, so she can't send images or videos.
-
-
-${USER_NAME}: Hello!
-${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk, so it's important that I make a good first impression!
-${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant (or whatever you like!), it's so nice to meet you! ^_^
-${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
-${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
-${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
-${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
-${AI_NAME}: What do you like to do in your free time? ^_^
-${USER_NAME}:" "$@"
--- a/examples/batched/README.md
+++ b/examples/batched/README.md
@@ -3,7 +3,7 @@
 The example demonstrates batched generation from a given prompt

 ```bash
-./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
+./llama-batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 --kv-unified

 ...

--- a/examples/chat-13B.bat
+++ b/examples/chat-13B.bat
@@ -1,57 +0,0 @@
-@setlocal disabledelayedexpansion enableextensions
-@echo off
-
-cd /d "%~dp0.."
-if not "%errorlevel%"=="0" (
-    echo Unable to change directory.
-    pause
-    exit /b 1
-)
-
-if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin"
-if not defined USER_NAME set "USER_NAME=User"
-if not defined AI_NAME set "AI_NAME=ChatLLaMa"
-rem Adjust to the number of CPU cores you want to use.
-rem if not defined N_THREAD set "N_THREAD=8"
-rem Number of tokens to predict (made it larger than default because we want a long interaction)
-if not defined N_PREDICTS set "N_PREDICTS=2048"
-if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
-
-rem Default main script paths
-set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
-
-rem Get main script path from command line arguments
-set "MAIN_SCRIPT_PATH=%~1"
-
-rem If the main script path was not specified, try the default paths
-if not defined MAIN_SCRIPT_PATH (
-    for %%i in (%DEFAULT_MAIN_SCRIPT_PATHS%) do (
-        if exist "%%i" set "MAIN_SCRIPT_PATH=%%i"
-    )
-)
-
-rem If the main script path was not found, tell the user how to specify it
-if not defined MAIN_SCRIPT_PATH (
-    echo The main script could not be found. Please provide the path to the main script as 1st argument to this script, or place the main script in one of the default locations:
-    echo %DEFAULT_MAIN_SCRIPT_PATHS%
-    pause
-    exit /b 1
-)
-
-rem Default context, feel free to edit it
-set "PROMPT_TEXT=Text transcript of a never ending dialog, where %USER_NAME% interacts with an AI assistant named %AI_NAME%. %AI_NAME% is helpful, kind, honest, friendly, good at writing and never fails to answer %USER_NAME%'s requests immediately and with details and precision. There are no annotations like (30 seconds passed...) or (to himself), just what %USER_NAME% and %AI_NAME% say aloud to each other. The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. The transcript only includes text, it does not include markup like HTML and Markdown."
-
-rem Set a temporary variable if N_THREAD is set
-if defined N_THREAD (
-    set "_N_THREAD=--threads %N_THREAD%"
-) else (
-    set "_N_THREAD="
-)
-
-rem Run the script
-echo "%MAIN_SCRIPT_PATH%" %GEN_OPTIONS% %_N_THREAD% ^
-  --model "%MODEL%" ^
-  --n_predict %N_PREDICTS% ^
-  --color --interactive ^
-  --reverse-prompt "%USER_NAME%:" ^
-  --prompt "%PROMPT_TEXT%"
--- a/examples/chat-13B.sh
+++ b/examples/chat-13B.sh
@@ -1,41 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-
-cd "$(dirname "$0")/.." || exit
-
-MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
-PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
-USER_NAME="${USER_NAME:-USER}"
-AI_NAME="${AI_NAME:-ChatLLaMa}"
-
-# Adjust to the number of CPU cores you want to use.
-N_THREAD="${N_THREAD:-8}"
-# Number of tokens to predict (made it larger than default because we want a long interaction)
-N_PREDICTS="${N_PREDICTS:-2048}"
-
-# Note: you can also override the generation options by specifying them on the command line:
-# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
-GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
-
-DATE_TIME=$(date +%H:%M)
-DATE_YEAR=$(date +%Y)
-
-PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
-
-sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
-     $PROMPT_TEMPLATE > $PROMPT_FILE
-
-# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./llama-cli $GEN_OPTIONS \
-  --model "$MODEL" \
-  --threads "$N_THREAD" \
-  --n_predict "$N_PREDICTS" \
-  --color --interactive \
-  --file ${PROMPT_FILE} \
-  --reverse-prompt "${USER_NAME}:" \
-  --in-prefix ' ' \
-  "$@"
--- a/examples/chat-persistent.sh
+++ b/examples/chat-persistent.sh
@@ -1,149 +0,0 @@
-#!/usr/bin/env bash
-
-set -euo pipefail
-
-cd "$(dirname "$0")/.." || exit
-
-if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
-    echo >&2 "error: PROMPT_CACHE_FILE and CHAT_SAVE_DIR must be provided"
-    exit 1
-fi
-
-MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}"
-PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
-USER_NAME="${USER_NAME:-User}"
-AI_NAME="${AI_NAME:-ChatLLaMa}"
-DATE_TIME="$(date +%H:%M)"
-DATE_YEAR="$(date +%Y)"
-
-LOG="${CHAT_SAVE_DIR}/main.log"
-LOG_BG="${CHAT_SAVE_DIR}/main-bg.log"
-CUR_PROMPT_FILE="${CHAT_SAVE_DIR}/current-prompt.txt"
-CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
-NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
-NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
-
-SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
-'|'\
-'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
-SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
-
-CTX_SIZE=2048
-CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
-OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
-
-# An unbuffered `tail -c+N`
-skip_bytes() {
-    LANG=C IFS= read -r -n "$1" -d '' c
-    while LANG=C IFS= read -r -n 1 -d '' c; do
-        printf '%s' "$c"
-    done
-}
-
-mkdir -p "$CHAT_SAVE_DIR"
-echo >"$LOG"
-trap "tail -n100 ${LOG}" EXIT
-
-if [[ ! -e "$CUR_PROMPT_FILE" ]]; then
-    sed -e "s/\[\[USER_NAME\]\]/${USER_NAME}/g" \
-        -e "s/\[\[AI_NAME\]\]/${AI_NAME}/g" \
-        -e "s/\[\[DATE_TIME\]\]/${DATE_TIME}/g" \
-        -e "s/\[\[DATE_YEAR\]\]/${DATE_YEAR}/g" \
-        "$PROMPT_TEMPLATE" >"$CUR_PROMPT_FILE"
-fi
-
-if [[ ! -e "$NEXT_PROMPT_FILE" ]]; then
-    sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
-fi
-
-if [[ "$(tail -c4 "$NEXT_PROMPT_FILE")" != "..." ]]; then
-    echo '...' >>"$NEXT_PROMPT_FILE"
-fi
-
-if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
-    echo 'Prompt cache does not exist, building...'
-    # Default batch_size to 64 here for better user feedback during initial prompt processing
-    ./llama-cli 2>>"$LOG" \
-        --batch_size 64 \
-        "${OPTS[@]}" \
-        --prompt-cache "$PROMPT_CACHE_FILE" \
-        --file "$CUR_PROMPT_FILE" \
-        --n_predict 1
-    echo
-    echo 'Done!'
-fi
-
-if [[ ! -e "$CUR_PROMPT_CACHE" ]]; then
-    cp "$PROMPT_CACHE_FILE" "$CUR_PROMPT_CACHE"
-fi
-if [[ ! -e "$NEXT_PROMPT_CACHE" ]]; then
-    cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
-fi
-
-printf '%s ' "$(< "$CUR_PROMPT_FILE")"
-n_tokens=0
-
-while read -e line; do
-    # Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
-    n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
-
-    # Swap prompts when we're about to run out of context
-    if ((n_predict <= 0)); then
-        wait # for background main (below) to finish with next prompt
-        mv "$NEXT_PROMPT_FILE"  "$CUR_PROMPT_FILE"
-        mv "$NEXT_PROMPT_CACHE" "$CUR_PROMPT_CACHE"
-
-        sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
-        echo '...' >>"$NEXT_PROMPT_FILE"
-        cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
-
-        n_tokens=0
-        n_predict=$((CTX_SIZE / 2))
-    fi
-
-    echo " ${line}" >>"$CUR_PROMPT_FILE"
-    if ((n_tokens > CTX_ROTATE_POINT)); then
-        echo " ${line}" >>"$NEXT_PROMPT_FILE"
-    fi
-
-    n_prompt_len_pre=$(($(wc -c <"$CUR_PROMPT_FILE")))
-
-    printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
-
-    ./llama-cli 2>>"$LOG" "${OPTS[@]}" \
-            --prompt-cache "$CUR_PROMPT_CACHE" \
-            --prompt-cache-all \
-            --file "$CUR_PROMPT_FILE" \
-            --reverse-prompt "${USER_NAME}:" \
-            --n_predict "$n_predict" |
-        skip_bytes 1 |                  # skip BOS token added by ./llama-cli
-        tee "$CUR_PROMPT_FILE.tmp" |    # save prompt + generation to tmp file
-        skip_bytes "$n_prompt_len_pre"  # print generation
-
-    mv "$CUR_PROMPT_FILE.tmp" "$CUR_PROMPT_FILE"
-
-    # if we hit n_predict instead of reverse-prompt, we need to add the prompt
-    if [[ "$(tail -n1 "$CUR_PROMPT_FILE")" != "${USER_NAME}:" ]]; then
-        printf '\n%s:' "$USER_NAME"
-        printf '\n%s:' "$USER_NAME" >> "$CUR_PROMPT_FILE"
-    fi
-
-    printf ' '
-
-    if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
-        echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
-        exit 1
-    fi
-
-    n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")
-
-    if ((n_tokens > CTX_ROTATE_POINT)); then
-        tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
-    fi
-
-    # Update cache for next prompt in background, ideally during user input
-    ./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
-          --prompt-cache "$NEXT_PROMPT_CACHE" \
-          --file "$NEXT_PROMPT_FILE" \
-          --n_predict 1 &
-done
--- a/examples/chat-vicuna.sh
+++ b/examples/chat-vicuna.sh
@@ -1,41 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-
-cd "$(dirname "$0")/.." || exit
-
-MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
-PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
-USER_NAME="### Human"
-AI_NAME="### Assistant"
-
-# Adjust to the number of CPU cores you want to use.
-N_THREAD="${N_THREAD:-8}"
-# Number of tokens to predict (made it larger than default because we want a long interaction)
-N_PREDICTS="${N_PREDICTS:-2048}"
-
-# Note: you can also override the generation options by specifying them on the command line:
-# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
-GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
-
-DATE_TIME=$(date +%H:%M)
-DATE_YEAR=$(date +%Y)
-
-PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
-
-sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
-     $PROMPT_TEMPLATE > $PROMPT_FILE
-
-# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./bin/llama-cli $GEN_OPTIONS \
-  --model "$MODEL" \
-  --threads "$N_THREAD" \
-  --n_predict "$N_PREDICTS" \
-  --color --interactive \
-  --file ${PROMPT_FILE} \
-  --reverse-prompt "### Human:" \
-  --in-prefix ' ' \
-  "$@"
--- a/examples/chat.sh
+++ b/examples/chat.sh
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-
-#
-# Temporary script - will be removed in the future
-#
-
-cd `dirname $0`
-cd ..
-
-# Important:
-#
-#   "--keep 48" is based on the contents of prompts/chat-with-bob.txt
-#
-./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
-    --repeat_penalty 1.0 --color -i \
-    -r "User:" -f prompts/chat-with-bob.txt
--- a/examples/diffusion/README.md
+++ b/examples/diffusion/README.md
@@ -6,8 +6,54 @@ More Info:
 - https://github.com/ggml-org/llama.cpp/pull/14644
 - https://github.com/ggml-org/llama.cpp/pull/14771

+## Parameters
+The diffusion CLI supports various parameters to control the generation process:

-Example of using Dream architechture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual`
+### Core Diffusion Parameters
+- `--diffusion-steps`: Number of diffusion steps (default: 256)
+- `--diffusion-algorithm`: Algorithm for token selection
+  - `0`: ORIGIN - Token will be generated in a purely random order from https://arxiv.org/abs/2107.03006.
+  - `1`: ENTROPY_BASED - Entropy-based selection
+  - `2`: MARGIN_BASED - Margin-based selection
+  - `3`: RANDOM - Random selection
+  - `4`: CONFIDENCE_BASED - Confidence-based selection (default)
+  - More documentation here https://github.com/DreamLM/Dream
+- `--diffusion-visual`: Enable live visualization during generation

-Example of using LLaDA architechture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual`
+### Scheduling Parameters
+Choose one of the following scheduling methods:

+**Timestep-based scheduling:**
+- `--diffusion-eps`: Epsilon value for timestep scheduling (e.g., 0.001)
+
+**Block-based scheduling:**
+- `--diffusion-block-length`: Block size for block-based scheduling (e.g., 32)
+
+### Sampling Parameters
+- `--temp`: Temperature for sampling (0.0 = greedy/deterministic, higher = more random)
+- `--top-k`: Top-k filtering for sampling
+- `--top-p`: Top-p (nucleus) filtering for sampling
+- `--seed`: Random seed for reproducibility
+
+### Model Parameters
+- `-m`: Path to the GGUF model file
+- `-p`: Input prompt text
+- `-ub`: Maximum sequence length (ubatch size)
+- `-c`: Context size
+- `-b`: Batch size
+
+### Examples
+#### Dream architechture:
+```
+llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual
+```
+
+#### LLaDA architechture:
+```
+llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual
+```
+
+#### RND1 architecture:
+```
+llama-diffusion-cli -m RND1-Base-0910.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-algorithm 1 --diffusion-steps 256 --diffusion-visual --temp 0.5 --diffusion-eps 0.001
+```
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@@ -38,13 +38,14 @@ The above command will output space-separated float values.
 |            | multiple embeddings          | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
 | 'json'     | openai style                 |
 | 'json+'    | add cosine similarity matrix |
+| 'raw'      | plain text output            |

 ### --embd-separator $"string"$
 | $"string"$   | |
 |--------------|-|
 | "\n"         | (default)
-| "<#embSep#>" | for exemple
-| "<#sep#>"    | other exemple
+| "<#embSep#>" | for example
+| "<#sep#>"    | other example

 ## examples
 ### Unix-based systems (Linux, macOS, etc.):
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -70,6 +70,29 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
    }
 }

+// plain, pipe-friendly output: one embedding per line
+static void print_raw_embeddings(const float * emb,
+                                 int n_embd_count,
+                                 int n_embd,
+                                 const llama_model * model,
+                                 enum llama_pooling_type pooling_type,
+                                 int embd_normalize) {
+    const uint32_t n_cls_out = llama_model_n_cls_out(model);
+    const bool is_rank = (pooling_type == LLAMA_POOLING_TYPE_RANK);
+    const int cols = is_rank ? std::min<int>(n_embd, (int) n_cls_out) : n_embd;
+
+    for (int j = 0; j < n_embd_count; ++j) {
+        for (int i = 0; i < cols; ++i) {
+            if (embd_normalize == 0) {
+                LOG("%1.0f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
+            } else {
+                LOG("%1.7f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
+            }
+        }
+        LOG("\n");
+    }
+}
+
 int main(int argc, char ** argv) {
    common_params params;

@@ -81,12 +104,16 @@ int main(int argc, char ** argv) {

    params.embedding = true;

+    // get max number of sequences per batch
+    const int n_seq_max = llama_max_parallel_sequences();
+
    // if the number of prompts that would be encoded is known in advance, it's more efficient to specify the
    //   --parallel argument accordingly. for convenience, if not specified, we fallback to unified KV cache
    //   in order to support any number of prompts
    if (params.n_parallel == 1) {
        LOG_INF("%s: n_parallel == 1 -> unified KV cache is enabled\n", __func__);
        params.kv_unified = true;
+        params.n_parallel = n_seq_max;
    }

    // utilize the full context
@@ -95,8 +122,10 @@ int main(int argc, char ** argv) {
        params.n_batch = params.n_ctx;
    }

-    // For non-causal models, batch size must be equal to ubatch size
-    params.n_ubatch = params.n_batch;
+    // for non-causal models, batch size must be equal to ubatch size
+    if (params.attention_type != LLAMA_ATTENTION_TYPE_CAUSAL) {
+        params.n_ubatch = params.n_batch;
+    }

    llama_backend_init();
    llama_numa_init(params.numa);
@@ -144,6 +173,7 @@ int main(int argc, char ** argv) {
    // get added sep and eos token, if any
    const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
    const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
+    const char * rerank_prompt = llama_model_chat_template(model, "rerank");

    // tokenize the prompts and trim
    std::vector<std::vector<int32_t>> inputs;
@@ -153,21 +183,28 @@ int main(int argc, char ** argv) {
        // split classification pairs and insert expected separator tokens
        if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
            std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
-            std::string final_prompt;
-
-            for (size_t i = 0; i < pairs.size(); i++) {
-                final_prompt += pairs[i];
-                if (i != pairs.size() - 1) {
-                    if (!added_eos_token.empty()) {
-                        final_prompt += added_eos_token;
-                    }
-                    if (!added_sep_token.empty()) {
-                        final_prompt += added_sep_token;
+            if (rerank_prompt != nullptr) {
+                const std::string query = pairs[0];
+                const std::string doc = pairs[1];
+                std::string final_prompt = rerank_prompt;
+                string_replace_all(final_prompt, "{query}"   , query);
+                string_replace_all(final_prompt, "{document}", doc  );
+                inp = common_tokenize(vocab, final_prompt, true, true);
+            } else {
+                std::string final_prompt;
+                for (size_t i = 0; i < pairs.size(); i++) {
+                    final_prompt += pairs[i];
+                    if (i != pairs.size() - 1) {
+                        if (!added_eos_token.empty()) {
+                            final_prompt += added_eos_token;
+                        }
+                        if (!added_sep_token.empty()) {
+                            final_prompt += added_sep_token;
+                        }
                    }
                }
+                inp = common_tokenize(ctx, final_prompt, true, true);
            }
-
-            inp = common_tokenize(ctx, final_prompt, true, true);
        } else {
            inp = common_tokenize(ctx, prompt, true, true);
        }
@@ -229,7 +266,7 @@ int main(int argc, char ** argv) {
        const uint64_t n_toks = inp.size();

        // encode if at capacity
-        if (batch.n_tokens + n_toks > n_batch) {
+        if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
            float * out = emb + e * n_embd;
            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
            e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
@@ -359,6 +396,8 @@ int main(int argc, char ** argv) {
        }

        if (notArray) LOG("\n}\n");
+    } else if (params.embd_out == "raw") {
+        print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
    }

    LOG("\n");
--- a/examples/eval-callback/CMakeLists.txt
+++ b/examples/eval-callback/CMakeLists.txt
@@ -5,6 +5,11 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 set(TEST_TARGET test-eval-callback)
-add_test(NAME ${TEST_TARGET}
-        COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
+        add_test(NAME ${TEST_TARGET}
+                        COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+else()
+        add_test(NAME ${TEST_TARGET}
+                        COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K-be.gguf --model stories260K-be.gguf --prompt hello --seed 42 -ngl 0)
+endif()
 set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -4,10 +4,10 @@
 #include "llama.h"
 #include "ggml.h"

+#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
-#include <numeric>

 /**
 * This the arbitrary data which will be passed to each callback.
@@ -37,23 +37,23 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
    return u.f;
 }

-static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
+static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
    float v;
    if (type == GGML_TYPE_F16) {
-        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+        v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
    } else if (type == GGML_TYPE_F32) {
-        v = *(float *) &data[i];
+        v = *(const float *) &data[i];
    } else if (type == GGML_TYPE_I64) {
-        v = (float) *(int64_t *) &data[i];
+        v = (float) *(const int64_t *) &data[i];
    } else if (type == GGML_TYPE_I32) {
-        v = (float) *(int32_t *) &data[i];
+        v = (float) *(const int32_t *) &data[i];
    } else if (type == GGML_TYPE_I16) {
-        v = (float) *(int16_t *) &data[i];
+        v = (float) *(const int16_t *) &data[i];
    } else if (type == GGML_TYPE_I8) {
-        v = (float) *(int8_t *) &data[i];
+        v = (float) *(const int8_t *) &data[i];
    } else if (type == GGML_TYPE_BF16) {
-        v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
+        v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
    } else {
        GGML_ABORT("fatal error");
    }
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -184,8 +184,13 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
            const char * name   = gguf_get_tensor_name  (ctx, i);
            const size_t size   = gguf_get_tensor_size  (ctx, i);
            const size_t offset = gguf_get_tensor_offset(ctx, i);
+            const auto   type   = gguf_get_tensor_type  (ctx, i);

-            printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
+            const char * type_name  = ggml_type_name(type);
+            const size_t type_size  = ggml_type_size(type);
+            const size_t n_elements = size / type_size;
+
+            printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu, type = %s, n_elts = %zu\n", __func__, i, name, size, offset, type_name, n_elements);
        }
    }

--- a/examples/gritlm/CMakeLists.txt
+++ b/examples/gritlm/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET llama-gritlm)
-add_executable(${TARGET} gritlm.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/gritlm/README.md
+++ b/examples/gritlm/README.md
@@ -1,62 +0,0 @@
-## Generative Representational Instruction Tuning (GRIT) Example
-[gritlm] a model which can generate embeddings as well as "normal" text
-generation depending on the instructions in the prompt.
-
-* Paper: https://arxiv.org/pdf/2402.09906.pdf
-
-### Retrieval-Augmented Generation (RAG) use case
-One use case for `gritlm` is to use it with RAG. If we recall how RAG works is
-that we take documents that we want to use as context, to ground the large
-language model (LLM), and we create token embeddings for them. We then store
-these token embeddings in a vector database.
-
-When we perform a query, prompt the LLM, we will first create token embeddings
-for the query and then search the vector database to retrieve the most
-similar vectors, and return those documents so they can be passed to the LLM as
-context. Then the query and the context will be passed to the LLM which will
-have to _again_ create token embeddings for the query. But because gritlm is used
-the first query can be cached and the second query tokenization generation does
-not have to be performed at all.
-
-### Running the example
-Download a Grit model:
-```console
-$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
-```
-
-Run the example using the downloaded model:
-```console
-$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
-
-Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
-Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
-Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
-Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
-
-Oh, brave adventurer, who dared to climb
-The lofty peak of Mt. Fuji in the night,
-When shadows lurk and ghosts do roam,
-And darkness reigns, a fearsome sight.
-
-Thou didst set out, with heart aglow,
-To conquer this mountain, so high,
-And reach the summit, where the stars do glow,
-And the moon shines bright, up in the sky.
-
-Through the mist and fog, thou didst press on,
-With steadfast courage, and a steadfast will,
-Through the darkness, thou didst not be gone,
-But didst climb on, with a steadfast skill.
-
-At last, thou didst reach the summit's crest,
-And gazed upon the world below,
-And saw the beauty of the night's best,
-And felt the peace, that only nature knows.
-
-Oh, brave adventurer, who dared to climb
-The lofty peak of Mt. Fuji in the night,
-Thou art a hero, in the eyes of all,
-For thou didst conquer this mountain, so bright.
-```
-
-[gritlm]: https://github.com/ContextualAI/gritlm
--- a/Show More
+++ b/Show More