metal : back to a single queue per device

ggml-ci
metal : clean-up loose ends, ready for tests
2026-04-23 16:37:33 +03:00 · 2025-09-09 17:06:46 +03:00 · 2025-09-09 15:01:21 +03:00 · 2025-09-09 14:29:54 +03:00 · 2025-09-09 14:28:34 +03:00 · 2025-09-09 14:01:15 +03:00
891 changed files with 38390 additions and 80234 deletions
--- a/.clang-format
+++ b/.clang-format
@@ -22,13 +22,6 @@ AllowShortIfStatementsOnASingleLine: Never
 AllowShortLambdasOnASingleLine: Inline
 AllowShortLoopsOnASingleLine: false
 AlwaysBreakBeforeMultilineStrings: true
-# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
-AttributeMacros:
-  - __host__
-  - __device__
-  - __global__
-  - __forceinline__
-  - __launch_bounds__
 BinPackArguments: true
 BinPackParameters: false # OnePerLine
 BitFieldColonSpacing: Both
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -17,7 +17,6 @@ Checks: >
    clang-analyzer-*,
    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
    performance-*,
-    -performance-enum-size,
    portability-*,
    -portability-simd-intrinsics,
    misc-*,
--- a/.devops/intel.Dockerfile
+++ b/.devops/intel.Dockerfile
@@ -1,8 +1,8 @@
-ARG ONEAPI_VERSION=2025.2.2-0-devel-ubuntu24.04
+ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04

 ## Build Image

-FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
@@ -31,7 +31,7 @@ RUN mkdir -p /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

-FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base

 RUN apt-get update \
    && apt-get install -y libgomp1 curl\
--- a/.devops/musa.Dockerfile
+++ b/.devops/musa.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc4.3.0
+ARG MUSA_VERSION=rc4.2.0
 # Target the MUSA build image
 ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64

--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -128,6 +128,10 @@ effectiveStdenv.mkDerivation (finalAttrs: {
  };

  postPatch = ''
+    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
+      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
+      --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
  '';

  # With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
--- a/.devops/rocm.Dockerfile
+++ b/.devops/rocm.Dockerfile
@@ -1,10 +1,10 @@
 ARG UBUNTU_VERSION=24.04

 # This needs to generally match the container host's environment.
-ARG ROCM_VERSION=7.0
-ARG AMDGPU_VERSION=7.0
+ARG ROCM_VERSION=6.4
+ARG AMDGPU_VERSION=6.4

-# Target the ROCm build image
+# Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

 ### Build image
@@ -13,14 +13,18 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
-# gfx803, gfx900, gfx906, gfx1032, gfx1101, gfx1102,not officialy supported
-# check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html
+# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
+# gfx906 is deprecated
+#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html

-ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151'
-#ARG ROCM_DOCKER_ARCH='gfx1151'
+ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
+#ARG ROCM_DOCKER_ARCH=gfx1100

-# Set ROCm architectures
+# Set nvcc architectured
 ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+# ENV CC=/opt/rocm/llvm/bin/clang
+# ENV CXX=/opt/rocm/llvm/bin/clang++

 RUN apt-get update \
    && apt-get install -y \
@@ -36,12 +40,7 @@ WORKDIR /app
 COPY . .

 RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
-    cmake -S . -B build \
-        -DGGML_HIP=ON \
-        -DGGML_HIP_ROCWMMA_FATTN=ON \
-        -DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \
-        -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \
-        -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
+    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
    && cmake --build build --config Release -j$(nproc)

 RUN mkdir -p /app/lib \
--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -1,123 +0,0 @@
-ARG GCC_VERSION=15.2.0
-ARG UBUNTU_VERSION=24.04
-
-### Build Llama.cpp stage
-FROM gcc:${GCC_VERSION} AS build
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
-    apt update -y && \
-    apt upgrade -y && \
-    apt install -y --no-install-recommends \
-        git cmake ccache ninja-build \
-        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
-        libopenblas-dev libcurl4-openssl-dev && \
-    rm -rf /var/lib/apt/lists/*
-
-WORKDIR /app
-COPY . .
-
-RUN --mount=type=cache,target=/root/.ccache \
-    --mount=type=cache,target=/app/build \
-    cmake -S . -B build -G Ninja \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-        -DLLAMA_BUILD_TESTS=OFF \
-        -DGGML_BACKEND_DL=OFF \
-        -DGGML_NATIVE=OFF \
-        -DGGML_BLAS=ON \
-        -DGGML_BLAS_VENDOR=OpenBLAS && \
-    cmake --build build --config Release -j $(nproc) && \
-    cmake --install build --prefix /opt/llama.cpp
-
-COPY *.py             /opt/llama.cpp/bin
-COPY .devops/tools.sh /opt/llama.cpp/bin
-
-COPY gguf-py          /opt/llama.cpp/gguf-py
-COPY requirements.txt /opt/llama.cpp/gguf-py
-COPY requirements     /opt/llama.cpp/gguf-py/requirements
-
-
-### Collect all llama.cpp binaries, libraries and distro libraries
-FROM scratch AS collector
-
-# Copy llama.cpp binaries and libraries
-COPY --from=build /opt/llama.cpp/bin     /llama.cpp/bin
-COPY --from=build /opt/llama.cpp/lib     /llama.cpp/lib
-COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
-
-
-### Base image
-FROM ubuntu:${UBUNTU_VERSION} AS base
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
-    apt update -y && \
-    apt install -y --no-install-recommends \
-        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
-        # See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
-        curl libgomp1 libopenblas-dev && \
-    apt autoremove -y && \
-    apt clean -y && \
-    rm -rf /tmp/* /var/tmp/* && \
-    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
-    find /var/cache -type f -delete
-
-# Copy llama.cpp libraries
-COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu
-
-
-### Full
-FROM base AS full
-
-ENV PATH="/root/.cargo/bin:${PATH}"
-WORKDIR /app
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
-    apt update -y && \
-    apt install -y \
-        git cmake libjpeg-dev \
-        python3 python3-pip python3-dev && \
-    apt autoremove -y && \
-    apt clean -y && \
-    rm -rf /tmp/* /var/tmp/* && \
-    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
-    find /var/cache -type f -delete
-
-RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
-
-COPY --from=collector /llama.cpp/bin /app
-COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
-
-RUN pip install --no-cache-dir --break-system-packages \
-        -r /app/gguf-py/requirements.txt
-
-ENTRYPOINT [ "/app/tools.sh" ]
-
-
-### CLI Only
-FROM base AS light
-
-WORKDIR /llama.cpp/bin
-
-# Copy llama.cpp binaries and libraries
-COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
-
-ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
-
-
-### Server
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-WORKDIR /llama.cpp/bin
-
-# Copy llama.cpp binaries and libraries
-COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
-
-EXPOSE 8080
-
-ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]
--- a/.editorconfig
+++ b/.editorconfig
@@ -52,11 +52,3 @@ insert_final_newline = unset
 [vendor/miniaudio/miniaudio.h]
 trim_trailing_whitespace = unset
 insert_final_newline = unset
-
-[tools/server/webui/**]
-indent_style = unset
-indent_size = unset
-end_of_line = unset
-charset = unset
-trim_trailing_whitespace = unset
-insert_final_newline = unset
--- a/.github/actions/install-exe/action.yml
+++ b/.github/actions/install-exe/action.yml
@@ -1,36 +0,0 @@
-name: "Install exe"
-description: "Download and install exe"
-inputs:
-  url:
-    description: "URL of the exe installer"
-    required: true
-  args:
-    description: "Installer arguments"
-    required: true
-  timeout:
-    description: "Timeout (in ms)"
-    required: false
-    default: "600000"
-
-runs:
-  using: "composite"
-  steps:
-    - name: Install EXE
-      shell: pwsh
-      run: |
-        $ErrorActionPreference = "Stop"
-        write-host "Downloading Installer EXE"
-        Invoke-WebRequest -Uri "${{ inputs.url }}" -OutFile "${env:RUNNER_TEMP}\temp-install.exe"
-        write-host "Installing"
-        $proc = Start-Process "${env:RUNNER_TEMP}\temp-install.exe" -ArgumentList '${{ inputs.args }}' -NoNewWindow -PassThru
-        $completed = $proc.WaitForExit(${{ inputs.timeout }})
-        if (-not $completed) {
-            Write-Error "Installer timed out. Killing the process"
-            $proc.Kill()
-            exit 1
-        }
-        if ($proc.ExitCode -ne 0) {
-            Write-Error "Installer failed with exit code $($proc.ExitCode)"
-            exit 1
-        }
-        write-host "Completed installation"
--- a/.github/actions/linux-setup-spacemit/action.yml
+++ b/.github/actions/linux-setup-spacemit/action.yml
@@ -1,20 +0,0 @@
-name: "Linux - Setup SpacemiT Toolchain"
-description: "Setup SpacemiT Toolchain for Linux"
-inputs:
-  path:
-    description: "Installation path"
-    required: true
-  version:
-    description: "SpacemiT toolchain version"
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Setup SpacemiT Toolchain
-      id: setup
-      uses: ./.github/actions/unarchive-tar
-      with:
-        url: https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v${{ inputs.version }}.tar.xz
-        path: ${{ inputs.path }}
-        strip: 1
--- a/.github/actions/linux-setup-vulkan/action.yml
+++ b/.github/actions/linux-setup-vulkan/action.yml
@@ -1,20 +0,0 @@
-name: "Linux - Setup Vulkan SDK"
-description: "Setup Vulkan SDK for Linux"
-inputs:
-  path:
-    description: "Installation path"
-    required: true
-  version:
-    description: "Vulkan SDK version"
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Setup Vulkan SDK
-      id: setup
-      uses: ./.github/actions/unarchive-tar
-      with:
-        url: https://sdk.lunarg.com/sdk/download/${{ inputs.version }}/linux/vulkan_sdk.tar.xz
-        path: ${{ inputs.path }}
-        strip: 1
--- a/.github/actions/unarchive-tar/action.yml
+++ b/.github/actions/unarchive-tar/action.yml
@@ -1,27 +0,0 @@
-name: "Unarchive tar"
-description: "Download and unarchive tar into directory"
-inputs:
-  url:
-    description: "URL of the tar archive"
-    required: true
-  path:
-    description: "Directory to unarchive into"
-    required: true
-  type:
-    description: "Compression type (tar option)"
-    required: false
-    default: "J"
-  strip:
-    description: "Strip components"
-    required: false
-    default: "0"
-
-runs:
-  using: "composite"
-  steps:
-    - name: Unarchive into directory
-      shell: bash
-      run: |
-        mkdir -p ${{ inputs.path }}
-        cd ${{ inputs.path }}
-        curl --no-progress-meter ${{ inputs.url }} | tar -${{ inputs.type }}x --strip-components=${{ inputs.strip }}
--- a/.github/actions/windows-setup-rocm/action.yml
+++ b/.github/actions/windows-setup-rocm/action.yml
@@ -1,15 +0,0 @@
-name: "Windows - Setup ROCm"
-description: "Setup ROCm for Windows"
-inputs:
-  version:
-    description: "ROCm version"
-    required: true
-
-runs:
-  using: "composite"
-  steps:
-    - name: Setup ROCm
-      uses: ./.github/actions/install-exe
-      with:
-        url: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ inputs.version }}-WinSvr2022-For-HIP.exe
-        args: -install
--- a/.github/workflows/build-amd.yml
+++ b/.github/workflows/build-amd.yml
@@ -1,52 +0,0 @@
-name: CI (AMD)
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: [
-      '.github/workflows/build-amd.yml',
-      '**/CMakeLists.txt',
-      '**/.cmake',
-      '**/*.h',
-      '**/*.hpp',
-      '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.cuh',
-      '**/*.comp'
-    ]
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  ggml-ci-x64-amd-vulkan:
-    runs-on: [self-hosted, Linux, X64, AMD]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  ggml-ci-x64-amd-rocm:
-    runs-on: [self-hosted, Linux, X64, AMD]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          amd-smi static
-          GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
--- a/.github/workflows/build-cache.yml
+++ b/.github/workflows/build-cache.yml
@@ -1,89 +0,0 @@
-name: Build Actions Cache
-
-on:
-  workflow_dispatch: # allows manual triggering
-  schedule:
-    - cron: '0 * * * *'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  ubuntu-24-vulkan-cache:
-    runs-on: ubuntu-24.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Get latest Vulkan SDK version
-        id: vulkan_sdk_version
-        run: |
-          echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
-
-      - name: Setup Cache
-        uses: actions/cache@v4
-        id: cache-sdk
-        with:
-          path: ./vulkan_sdk
-          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
-
-      - name: Setup Vulkan SDK
-        if: steps.cache-sdk.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-vulkan
-        with:
-          path: ./vulkan_sdk
-          version: ${{ env.VULKAN_SDK_VERSION }}
-
-  ubuntu-24-spacemit-cache:
-    runs-on: ubuntu-24.04
-
-    env:
-      # Make sure this is in sync with build-linux-cross.yml
-      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Setup Cache
-        uses: actions/cache@v4
-        id: cache-toolchain
-        with:
-          path: ./spacemit_toolchain
-          key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
-
-      - name: Setup SpacemiT Toolchain
-        if: steps.cache-toolchain.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-spacemit
-        with:
-          path: ./spacemit_toolchain
-          version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
-
-  windows-2022-rocm-cache:
-    runs-on: windows-2022
-
-    env:
-      # Make sure this is in sync with build.yml
-      HIPSDK_INSTALLER_VERSION: "25.Q3"
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Setup Cache
-        uses: actions/cache@v4
-        id: cache-rocm
-        with:
-          path: C:\Program Files\AMD\ROCm
-          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Setup ROCm
-        if: steps.cache-rocm.outputs.cache-hit != 'true'
-        uses: ./.github/actions/windows-setup-rocm
-        with:
-          version: ${{ env.HIPSDK_INSTALLER_VERSION }}
--- a/.github/workflows/build-linux-cross.yml
+++ b/.github/workflows/build-linux-cross.yml
@@ -141,6 +141,97 @@ jobs:

  #         cmake --build build --config Release -j $(nproc)

+  ubuntu-24-ppc64el-cpu-cross:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup PowerPC64le
+        run: |
+          sudo dpkg --add-architecture ppc64el
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+          deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+          EOF
+
+          sudo apt-get update || true    ;# Prevent failure due to missing URLs.
+
+          sudo apt-get install -y --no-install-recommends \
+                  build-essential \
+                  gcc-14-powerpc64le-linux-gnu \
+                  g++-14-powerpc64le-linux-gnu
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+                         -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
+                         -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
+                         -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  # ubuntu-24-ppc64el-vulkan-cross:
+  #   runs-on: ubuntu-24.04
+
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - name: Setup PowerPC64le
+  #       run: |
+  #         sudo dpkg --add-architecture ppc64el
+
+  #         # Add arch-specific repositories for non-amd64 architectures
+  #         cat << EOF | sudo tee /etc/apt/sources.list.d/ppc64el-ports.list
+  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
+  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
+  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
+  #         deb [arch=ppc64el] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
+  #         EOF
+
+  #         sudo apt-get update || true    ;# Prevent failure due to missing URLs.
+
+  #         sudo apt-get install -y --no-install-recommends \
+  #                 build-essential \
+  #                 glslc \
+  #                 gcc-14-powerpc64le-linux-gnu \
+  #                 g++-14-powerpc64le-linux-gnu \
+  #                 libvulkan-dev:ppc64el
+
+  #     - name: Build
+  #       run: |
+  #         cmake -B build -DLLAMA_CURL=OFF \
+  #                        -DCMAKE_BUILD_TYPE=Release \
+  #                        -DGGML_VULKAN=ON \
+  #                        -DGGML_OPENMP=OFF \
+  #                        -DLLAMA_BUILD_EXAMPLES=ON \
+  #                        -DLLAMA_BUILD_TOOLS=ON \
+  #                        -DLLAMA_BUILD_TESTS=OFF \
+  #                        -DCMAKE_SYSTEM_NAME=Linux \
+  #                        -DCMAKE_SYSTEM_PROCESSOR=ppc64 \
+  #                        -DCMAKE_C_COMPILER=powerpc64le-linux-gnu-gcc-14 \
+  #                        -DCMAKE_CXX_COMPILER=powerpc64le-linux-gnu-g++-14 \
+  #                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+  #                        -DCMAKE_FIND_ROOT_PATH=/usr/lib/powerpc64le-linux-gnu \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+  #                        -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+  #         cmake --build build --config Release -j $(nproc)
+
  debian-13-loongarch64-cpu-cross:
    runs-on: ubuntu-24.04
    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
@@ -253,45 +344,3 @@ jobs:
                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

          cmake --build build --config Release -j $(nproc)
-
-  ubuntu-24-riscv64-cpu-spacemit-ime-cross:
-    runs-on: ubuntu-24.04
-
-    env:
-      # Make sure this is in sync with build-cache.yml
-      SPACEMIT_IME_TOOLCHAIN_VERSION: "1.1.2"
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Use SpacemiT Toolchain Cache
-        uses: actions/cache@v4
-        id: cache-toolchain
-        with:
-          path: ./spacemit_toolchain
-          key: spacemit-ime-toolchain-v${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}-${{ runner.os }}
-
-      - name: Setup SpacemiT Toolchain
-        if: steps.cache-toolchain.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-spacemit
-        with:
-          path: ./spacemit_toolchain
-          version: ${{ env.SPACEMIT_IME_TOOLCHAIN_VERSION }}
-
-      - name: Build
-        run: |
-          export RISCV_ROOT_PATH=${PWD}/spacemit_toolchain
-          cmake -B build -DLLAMA_CURL=OFF \
-                         -DCMAKE_BUILD_TYPE=Release \
-                         -DGGML_OPENMP=OFF \
-                         -DLLAMA_BUILD_EXAMPLES=ON \
-                         -DLLAMA_BUILD_TOOLS=ON \
-                         -DLLAMA_BUILD_TESTS=OFF \
-                         -DGGML_CPU_RISCV64_SPACEMIT=ON \
-                         -DGGML_RVV=ON \
-                         -DGGML_RV_ZFH=ON \
-                         -DGGML_RV_ZICBOP=ON \
-                         -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
-                         -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
-
-          cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build-riscv-native.yml
+++ b/.github/workflows/build-riscv-native.yml
@@ -6,7 +6,7 @@ on:

 jobs:
  debian-13-riscv64-native: # Bianbu 2.2
-    runs-on: [self-hosted, RISCV64]
+    runs-on: self-hosted

    steps:
      - name: Install prerequisites
@@ -58,63 +58,3 @@ jobs:
            -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

          cmake --build build --config Release -j $(nproc)
-
-  # debian-13-riscv64-spacemit-ime-native: # Bianbu 2.2
-  #   runs-on: [self-hosted, RISCV64]
-
-  #   steps:
-  #     - name: Install prerequisites
-  #       run: |
-  #         sudo apt-get update || true
-  #         sudo apt-get install -y libatomic1
-  #     - uses: actions/checkout@v4
-  #     - name: Setup Riscv
-  #       run: |
-  #         sudo apt-get update || true
-  #         sudo apt-get install -y --no-install-recommends \
-  #                 build-essential \
-  #                 gcc-14-riscv64-linux-gnu \
-  #                 g++-14-riscv64-linux-gnu \
-  #                 ccache \
-  #                 cmake
-  #         sudo apt-get upgrade binutils -y
-
-  #     - name: Setup ccache
-  #       run: |
-  #         mkdir -p $HOME/.ccache
-  #         ccache -M 5G -d $HOME/.ccache
-  #         export CCACHE_LOGFILE=/home/runneruser/ccache_debug/ccache.log
-  #         export CCACHE_DEBUGDIR="/home/runneruser/ccache_debug"
-  #         echo "$GITHUB_WORKSPACE"
-  #         echo "CCACHE_LOGFILE=$CCACHE_LOGFILE" >> $GITHUB_ENV
-  #         echo "CCACHE_DEBUGDIR=$CCACHE_DEBUGDIR" >> $GITHUB_ENV
-  #         echo "CCACHE_BASEDIR=$GITHUB_WORKSPACE" >> $GITHUB_ENV
-  #         echo "CCACHE_DIR=$HOME/.ccache" >> $GITHUB_ENV
-
-  #     - name: Build
-  #       run: |
-  #         cmake -B build \
-  #           -DLLAMA_CURL=OFF \
-  #           -DCMAKE_BUILD_TYPE=Release \
-  #           -DGGML_OPENMP=OFF \
-  #           -DLLAMA_BUILD_EXAMPLES=ON \
-  #           -DLLAMA_BUILD_TOOLS=ON \
-  #           -DLLAMA_BUILD_TESTS=OFF \
-  #           -DCMAKE_SYSTEM_NAME=Linux \
-  #           -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
-  #           -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
-  #           -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
-  #           -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-  #           -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-  #           -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-  #           -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
-  #           -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
-  #           -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
-  #           -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH \
-  #           -DGGML_RVV=ON \
-  #           -DGGML_RV_ZFH=ON \
-  #           -DGGML_RV_ZICBOP=ON \
-  #           -DGGML_CPU_RISCV64_SPACEMIT=ON \
-  #           -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1
-
-  #         cmake --build build --config Release -j $(nproc)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -56,7 +56,7 @@ env:

 jobs:
  macOS-latest-cmake-arm64:
-    runs-on: macos-latest
+    runs-on: macos-14

    steps:
      - name: Clone
@@ -88,7 +88,6 @@ jobs:
            -DGGML_METAL_SHADER_DEBUG=ON \
            -DGGML_RPC=ON
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
-          leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1

      - name: Test
        id: cmake_test
@@ -97,7 +96,7 @@ jobs:
          ctest -L 'main|curl' --verbose --timeout 900

  macOS-latest-cmake-x64:
-    runs-on: macos-15-intel
+    runs-on: macos-13

    steps:
      - name: Clone
@@ -127,8 +126,7 @@ jobs:
            -DCMAKE_BUILD_RPATH="@loader_path" \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_METAL=OFF \
-            -DGGML_RPC=ON \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
+            -DGGML_RPC=ON
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -138,7 +136,7 @@ jobs:
          ctest -L main --verbose --timeout 900

  macOS-latest-cmake-arm64-webgpu:
-    runs-on: macos-latest
+    runs-on: macos-14

    steps:
      - name: Clone
@@ -192,10 +190,6 @@ jobs:
            os: ubuntu-22.04
          - build: 'arm64'
            os: ubuntu-22.04-arm
-          - build: 's390x'
-            os: ubuntu-24.04-s390x
-          - build: 'ppc64le'
-            os: ubuntu-24.04-ppc64le

    runs-on: ${{ matrix.os }}

@@ -207,31 +201,14 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: ubuntu-cpu-cmake-${{ matrix.build }}
+          key: ubuntu-cpu-cmake
          evict-old-files: 1d

-      - name: Build Dependencies
-        id: build_depends
+      - name: Dependencies
+        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends \
-            python3 python3-pip python3-dev \
-            libjpeg-dev build-essential libcurl4-openssl-dev \
-            git-lfs
-
-      - name: Python Dependencies
-        id: python_depends
-        run: |
-          python3 -m pip install --upgrade pip
-          pip3 install ./gguf-py
-
-      - name: Swap Endianness
-        id: endianness
-        if: ${{ matrix.build == 's390x' }}
-        run: |
-          for f in models/*.gguf; do
-            echo YES | python3 gguf-py/gguf/scripts/gguf_convert_endian.py $f big
-          done
+          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Build
        id: cmake_build
@@ -249,7 +226,6 @@ jobs:

      - name: Test llama2c conversion
        id: llama2c_test
-        if: ${{ matrix.build != 's390x' }}
        run: |
          cd build
          echo "Fetch tokenizer"
@@ -259,15 +235,6 @@ jobs:
          ./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
          ./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

-      - name: Test llama2c (s390x)
-        id: llama2c_test_s390x
-        if: ${{ matrix.build == 's390x' }}
-        run: |
-          cd build
-          echo "Fetch llama2c big-endian model"
-          wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
-          ./bin/llama-cli -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
-
  ubuntu-latest-cmake-sanitizer:
    runs-on: ubuntu-latest

@@ -362,11 +329,11 @@ jobs:
        id: checkout
        uses: actions/checkout@v4

-      # - name: ccache
-      #   uses: ggml-org/ccache-action@v1.2.16
-      #   with:
-      #     key: ubuntu-latest-cmake-rpc
-      #     evict-old-files: 1d
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ubuntu-latest-cmake-rpc
+          evict-old-files: 1d

      - name: Dependencies
        id: depends
@@ -387,8 +354,8 @@ jobs:
          cd build
          ctest -L main --verbose

-  ubuntu-24-cmake-vulkan:
-    runs-on: ubuntu-24.04
+  ubuntu-22-cmake-vulkan:
+    runs-on: ubuntu-22.04

    steps:
      - name: Clone
@@ -398,39 +365,20 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: ubuntu-24-cmake-vulkan
+          key: ubuntu-22-cmake-vulkan
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
-          sudo add-apt-repository -y ppa:kisak/kisak-mesa
+          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
-
-      - name: Get latest Vulkan SDK version
-        id: vulkan_sdk_version
-        run: |
-          echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
-
-      - name: Use Vulkan SDK Cache
-        uses: actions/cache@v4
-        id: cache-sdk
-        with:
-          path: ./vulkan_sdk
-          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
-
-      - name: Setup Vulkan SDK
-        if: steps.cache-sdk.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-vulkan
-        with:
-          path: ./vulkan_sdk
-          version: ${{ env.VULKAN_SDK_VERSION }}
+          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev

      - name: Build
        id: cmake_build
        run: |
-          source ./vulkan_sdk/setup-env.sh
          cmake -B build \
            -DGGML_VULKAN=ON
          cmake --build build --config Release -j $(nproc)
@@ -440,12 +388,11 @@ jobs:
        run: |
          cd build
          export GGML_VK_VISIBLE_DEVICES=0
-          export GGML_VK_DISABLE_F16=1
          # This is using llvmpipe and runs slower than other backends
          ctest -L main --verbose --timeout 4200

-  ubuntu-24-cmake-webgpu:
-    runs-on: ubuntu-24.04
+  ubuntu-22-cmake-webgpu:
+    runs-on: ubuntu-22.04

    steps:
      - name: Clone
@@ -455,34 +402,16 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: ubuntu-24-cmake-webgpu
+          key: ubuntu-22-cmake-webgpu
          evict-old-files: 1d

-      - name: Dependencies
-        id: depends
+      - name: Vulkan SDK Dependencies
+        id: vulkan-depends
        run: |
-          sudo add-apt-repository -y ppa:kisak/kisak-mesa
+          wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add -
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
          sudo apt-get update -y
-          sudo apt-get install -y build-essential mesa-vulkan-drivers libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev libcurl4-openssl-dev
-
-      - name: Get latest Vulkan SDK version
-        id: vulkan_sdk_version
-        run: |
-          echo "VULKAN_SDK_VERSION=$(curl https://vulkan.lunarg.com/sdk/latest/linux.txt)" >> "$GITHUB_ENV"
-
-      - name: Use Vulkan SDK Cache
-        uses: actions/cache@v4
-        id: cache-sdk
-        with:
-          path: ./vulkan_sdk
-          key: vulkan-sdk-${{ env.VULKAN_SDK_VERSION }}-${{ runner.os }}
-
-      - name: Setup Vulkan SDK
-        if: steps.cache-sdk.outputs.cache-hit != 'true'
-        uses: ./.github/actions/linux-setup-vulkan
-        with:
-          path: ./vulkan_sdk
-          version: ${{ env.VULKAN_SDK_VERSION }}
+          sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk libcurl4-openssl-dev

      - name: Dawn Dependency
        id: dawn-depends
@@ -525,7 +454,7 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev rocwmma-dev
+          sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev libcurl4-openssl-dev

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
@@ -544,7 +473,7 @@ jobs:

  ubuntu-22-cmake-musa:
    runs-on: ubuntu-22.04
-    container: mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
+    container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64

    steps:
      - name: Clone
@@ -780,7 +709,6 @@ jobs:

  macOS-latest-swift:
    runs-on: macos-latest
-    needs: ios-xcode-build

    strategy:
      matrix:
@@ -797,12 +725,6 @@ jobs:
          key: macOS-latest-swift
          evict-old-files: 1d

-      - name: Download xcframework artifact
-        uses: actions/download-artifact@v4
-        with:
-          name: llama-xcframework
-          path: build-apple/llama.xcframework/
-
      - name: Dependencies
        id: depends
        continue-on-error: true
@@ -824,6 +746,11 @@ jobs:
            -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

+      - name: xcodebuild for swift package
+        id: xcodebuild
+        run: |
+          ./build-xcframework.sh
+
  windows-msys2:
    runs-on: windows-2025

@@ -1097,7 +1024,7 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
    steps:
@@ -1123,49 +1050,34 @@ jobs:
        run:  examples/sycl/win-build-sycl.bat

  windows-latest-cmake-hip:
+    if: ${{ github.event.inputs.create_release != 'true' }}
    runs-on: windows-2022

-    env:
-      # The ROCm version must correspond to the version used in the HIP SDK.
-      ROCM_VERSION: "6.4.2"
-      # Make sure this is in sync with build-cache.yml
-      HIPSDK_INSTALLER_VERSION: "25.Q3"
-
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

-      - name: Grab rocWMMA package
-        id: grab_rocwmma
+      - name: Clone rocWMMA repository
+        id: clone_rocwmma
        run: |
-          curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/${{ env.ROCM_VERSION }}/pool/main/r/rocwmma-dev/rocwmma-dev_1.7.0.60402-120~24.04_amd64.deb"
-          7z x rocwmma.deb
-          7z x data.tar
+          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1

-      - name: Use ROCm Installation Cache
-        uses: actions/cache@v4
-        id: cache-rocm
-        with:
-          path: C:\Program Files\AMD\ROCm
-          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
-
-      - name: Setup ROCm
-        if: steps.cache-rocm.outputs.cache-hit != 'true'
-        uses: ./.github/actions/windows-setup-rocm
-        with:
-          version: ${{ env.HIPSDK_INSTALLER_VERSION }}
+      - name: Install
+        id: depends
+        run: |
+          $ErrorActionPreference = "Stop"
+          write-host "Downloading AMD HIP SDK Installer"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          write-host "Installing AMD HIP SDK"
+          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
+          $proc.WaitForExit(600000)
+          write-host "Completed AMD HIP SDK installation"

      - name: Verify ROCm
        id: verify
        run: |
-          # Find and test ROCm installation
-          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
-          if (-not $clangPath) {
-            Write-Error "ROCm installation not found"
-            exit 1
-          }
-          & $clangPath.FullName --version
+          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version

      - name: Install ccache
        uses: ggml-org/ccache-action@v1.2.16
@@ -1187,9 +1099,8 @@ jobs:
          cmake -G "Unix Makefiles" -B build -S . `
            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-${{ env.ROCM_VERSION }}/include/" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" `
            -DCMAKE_BUILD_TYPE=Release `
-            -DROCM_DIR="${env:HIP_PATH}" `
            -DGGML_HIP=ON `
            -DGGML_HIP_ROCWMMA_FATTN=ON `
            -DGGML_RPC=ON `
@@ -1230,17 +1141,8 @@ jobs:
        run: |
          ./build-xcframework.sh

-      - name: Upload xcframework artifact
-        uses: actions/upload-artifact@v4
-        with:
-          name: llama-xcframework
-          path: build-apple/llama.xcframework/
-          retention-days: 1
-
      - name: Build Xcode project
-        run: |
-          xcodebuild -downloadPlatform iOS
-          xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
+        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build

  android-build:
    runs-on: ubuntu-latest
@@ -1249,12 +1151,11 @@ jobs:
      - name: Clone
        uses: actions/checkout@v4

-      # Disabled due to size (400MB) and always 0 cache hits
-      # - name: ccache
-      #   uses: ggml-org/ccache-action@v1.2.16
-      #   with:
-      #     key: android-build
-      #     evict-old-files: 1d
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: android-build
+          evict-old-files: 1d

      - name: Set up JDK
        uses: actions/setup-java@v3
@@ -1306,238 +1207,3 @@ jobs:
              -DGGML_CANN=on \
              -DSOC_TYPE=${{ matrix.device }}
          cmake --build build -j $(nproc)
-
-# TODO: simplify the following workflows using a matrix
-# TODO: run lighter CI on PRs and the full CI only on master (if needed)
-  ggml-ci-x64-cpu-low-perf:
-    runs-on: ubuntu-22.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ggml-ci-x64-cpu-low-perf
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
-  ggml-ci-arm64-cpu-low-perf:
-    runs-on: ubuntu-22.04-arm
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ggml-ci-arm64-cpu-low-perf
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
-  ggml-ci-x64-cpu-high-perf:
-    runs-on: ubuntu-22.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ggml-ci-x64-cpu-high-perf
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
-  ggml-ci-arm64-cpu-high-perf:
-    runs-on: ubuntu-22.04-arm
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ggml-ci-arm64-cpu-high-perf
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
-  ggml-ci-arm64-cpu-high-perf-sve:
-    runs-on: ubuntu-22.04-arm
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: ccache
-        uses: ggml-org/ccache-action@v1.2.16
-        with:
-          key: ggml-ci-arm64-cpu-high-perf-sve
-          evict-old-files: 1d
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential libcurl4-openssl-dev
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
-  ggml-ci-x64-nvidia-cuda:
-    runs-on: [self-hosted, Linux, X64, NVIDIA]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          nvidia-smi
-          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  ggml-ci-x64-nvidia-vulkan-cm:
-    runs-on: [self-hosted, Linux, X64, NVIDIA]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  ggml-ci-x64-nvidia-vulkan-cm2:
-    runs-on: [self-hosted, Linux, X64, NVIDIA, COOPMAT2]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  ggml-ci-x64-cpu-amx:
-    runs-on: [self-hosted, Linux, X64, CPU, AMX]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
-
-  ggml-ci-mac-metal:
-    runs-on: [self-hosted, macOS, ARM64]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  ggml-ci-mac-vulkan:
-    runs-on: [self-hosted, macOS, ARM64]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Test
-        id: ggml-ci
-        run: |
-          vulkaninfo --summary
-          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
-
-  ggml-ci-arm64-cpu-kleidiai:
-     runs-on: ubuntu-22.04-arm
-
-     steps:
-       - name: Clone
-         id: checkout
-         uses: actions/checkout@v4
-
-       - name: ccache
-         uses: ggml-org/ccache-action@v1.2.16
-         with:
-           key: ggml-ci-arm64-cpu-kleidiai
-           evict-old-files: 1d
-
-       - name: Dependencies
-         id: depends
-         run: |
-           sudo apt-get update
-           sudo apt-get install -y build-essential libcurl4-openssl-dev
-
-       - name: Test
-         id: ggml-ci
-         run: |
-           GG_BUILD_KLEIDIAI=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -28,7 +28,7 @@ jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub

-    runs-on: ${{ matrix.config.runs_on }}
+    runs-on: ubuntu-22.04
    env:
      COMMIT_SHA: ${{ github.sha }}
    strategy:
@@ -39,12 +39,11 @@ jobs:
          # Note: the arm64 images are failing, which prevents the amd64 images from being built
          # https://github.com/ggml-org/llama.cpp/issues/11888
          #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "cpu",    dockerfile: ".devops/cpu.Dockerfile",    platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
-          - { tag: "cuda",   dockerfile: ".devops/cuda.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
-          - { tag: "musa",   dockerfile: ".devops/musa.Dockerfile",   platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
-          - { tag: "intel",  dockerfile: ".devops/intel.Dockerfile",  platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true,  runs_on: "ubuntu-22.04" }
-          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
-          - { tag: "s390x",  dockerfile: ".devops/s390x.Dockerfile",  platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
+          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
+          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
+          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
    steps:
@@ -54,7 +53,6 @@ jobs:
          fetch-depth: 0 # preserve git history, so we can determine the build number

      - name: Set up QEMU
-        if: ${{ matrix.config.tag != 's390x' }}
        uses: docker/setup-qemu-action@v3
        with:
          image: tonistiigi/binfmt:qemu-v7.0.0-28
@@ -69,19 +67,22 @@ jobs:
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

-      - name: Determine source tag name
-        id: srctag
-        uses: ./.github/actions/get-tag-name
-        env:
-          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-
-      - name: Determine image tag name
+      - name: Determine tag name
        id: tag
        shell: bash
        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
          REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}"  # to lower case
          REPO_NAME="${{ github.event.repository.name }}"

+          # determine tag name postfix (build number, commit hash)
+          if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
+            TAG_POSTFIX="-b${BUILD_NUMBER}"
+          else
+            SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
+            TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
+          fi
          # list all tags possible
          if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
              TYPE=""
@@ -89,19 +90,17 @@ jobs:
              TYPE="-${{ matrix.config.tag }}"
          fi
          PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
-          CACHETAGS="${PREFIX}buildcache${TYPE}"
-          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}-${{ steps.srctag.outputs.name }}"
-          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}-${{ steps.srctag.outputs.name }}"
-          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}-${{ steps.srctag.outputs.name }}"
-          echo "cache_output_tags=$CACHETAGS" >> $GITHUB_OUTPUT
+          FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
+          LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
+          SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
          echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
          echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
          echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
-          echo "cache_output_tags=$CACHETAGS"  # print out for debugging
          echo "full_output_tags=$FULLTAGS"  # print out for debugging
          echo "light_output_tags=$LIGHTTAGS"  # print out for debugging
          echo "server_output_tags=$SERVERTAGS"  # print out for debugging
        env:
+          GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

      - name: Free Disk Space (Ubuntu)
@@ -134,14 +133,11 @@ jobs:
          target: full
          provenance: false
          # using github experimental cache
-          #cache-from: type=gha
-          #cache-to: type=gha,mode=max
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
          # return to this if the experimental github cache is having issues
          #cache-to: type=local,dest=/tmp/.buildx-cache
          #cache-from: type=local,src=/tmp/.buildx-cache
-          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
-          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max

      - name: Build and push Light Docker image (tagged + versioned)
        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
@@ -156,14 +152,11 @@ jobs:
          target: light
          provenance: false
          # using github experimental cache
-          #cache-from: type=gha
-          #cache-to: type=gha,mode=max
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
          # return to this if the experimental github cache is having issues
          #cache-to: type=local,dest=/tmp/.buildx-cache
          #cache-from: type=local,src=/tmp/.buildx-cache
-          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
-          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max

      - name: Build and push Server Docker image (tagged + versioned)
        if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
@@ -178,37 +171,8 @@ jobs:
          target: server
          provenance: false
          # using github experimental cache
-          #cache-from: type=gha
-          #cache-to: type=gha,mode=max
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
          # return to this if the experimental github cache is having issues
          #cache-to: type=local,dest=/tmp/.buildx-cache
          #cache-from: type=local,src=/tmp/.buildx-cache
-          # using registry cache (no storage limit)
-          cache-from: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }}
-          cache-to: type=registry,ref=${{ steps.tag.outputs.cache_output_tags }},mode=max
-
-  create_tag:
-    name: Create and push git tag
-    runs-on: ubuntu-22.04
-    permissions:
-      contents: write
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-
-      - name: Determine source tag name
-        id: srctag
-        uses: ./.github/actions/get-tag-name
-        env:
-          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-
-      - name: Create and push git tag
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          git tag ${{ steps.srctag.outputs.name }} || exit 0
-          git push origin ${{ steps.srctag.outputs.name }} || exit 0
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -75,7 +75,7 @@ jobs:
          name: llama-bin-macos-arm64.zip

  macOS-x64:
-    runs-on: macos-15-intel
+    runs-on: macos-13

    steps:
      - name: Clone
@@ -108,8 +108,7 @@ jobs:
            -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
            -DLLAMA_FATAL_WARNINGS=ON \
            -DGGML_METAL=OFF \
-            -DGGML_RPC=ON \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
+            -DGGML_RPC=ON
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Determine tag name
@@ -150,7 +149,7 @@ jobs:
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: ubuntu-cpu-cmake-${{ matrix.build }}
+          key: ubuntu-cpu-cmake
          evict-old-files: 1d

      - name: Dependencies
@@ -462,7 +461,7 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7cd9bba0-7aab-4e30-b3ae-2221006a4a05/intel-oneapi-base-toolkit-2025.1.1.34_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"

@@ -505,7 +504,6 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin

          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
@@ -514,15 +512,10 @@ jobs:
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin

          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin

-          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin
-
          echo "cp oneAPI running time dll files to ./build/bin done"
          7z a llama-bin-win-sycl-x64.zip ./build/bin/*

@@ -535,71 +528,43 @@ jobs:
  windows-hip:
    runs-on: windows-2022

-    env:
-      HIPSDK_INSTALLER_VERSION: "25.Q3"
-
    strategy:
      matrix:
        include:
          - name: "radeon"
-            gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+            gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

-      - name: Grab rocWMMA package
-        id: grab_rocwmma
+      - name: Clone rocWMMA repository
+        id: clone_rocwmma
        run: |
-          curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.0.1/pool/main/r/rocwmma-dev/rocwmma-dev_2.0.0.70001-42~24.04_amd64.deb"
-          7z x rocwmma.deb
-          7z x data.tar
-
-      - name: Cache ROCm Installation
-        id: cache-rocm
-        uses: actions/cache@v4
-        with:
-          path: C:\Program Files\AMD\ROCm
-          key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+          git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1

      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
-          key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
+          key: windows-latest-cmake-hip-${{ matrix.name }}-x64
          evict-old-files: 1d

-      - name: Install ROCm
-        if: steps.cache-rocm.outputs.cache-hit != 'true'
+      - name: Install
        id: depends
        run: |
          $ErrorActionPreference = "Stop"
          write-host "Downloading AMD HIP SDK Installer"
-          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+          Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
          write-host "Installing AMD HIP SDK"
          $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
-          $completed = $proc.WaitForExit(600000)
-          if (-not $completed) {
-              Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
-              $proc.Kill()
-              exit 1
-          }
-          if ($proc.ExitCode -ne 0) {
-              Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
-              exit 1
-          }
+          $proc.WaitForExit(600000)
          write-host "Completed AMD HIP SDK installation"

      - name: Verify ROCm
        id: verify
        run: |
-          # Find and test ROCm installation
-          $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
-          if (-not $clangPath) {
-            Write-Error "ROCm installation not found"
-            exit 1
-          }
-          & $clangPath.FullName --version
+          & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version

      - name: Build
        id: cmake_build
@@ -609,7 +574,7 @@ jobs:
          cmake -G "Unix Makefiles" -B build -S . `
            -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" `
            -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" `
-            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.0.1/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
+            -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/ -Wno-ignored-attributes -Wno-nested-anon-types" `
            -DCMAKE_BUILD_TYPE=Release `
            -DGGML_BACKEND_DL=ON `
            -DGGML_NATIVE=OFF `
@@ -620,12 +585,9 @@ jobs:
            -DLLAMA_CURL=OFF
          cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
          md "build\bin\rocblas\library\"
-          md "build\bin\hipblaslt\library"
          cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
-          cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
          cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
-          cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"

      - name: Pack artifacts
        id: pack_artifacts
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -76,206 +76,51 @@ jobs:
        run: |
          pip install -r tools/server/tests/requirements.txt

-  webui-setup:
-    name: WebUI Setup
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
+      # Setup nodejs (to be used for verifying bundled index.html)
+      - uses: actions/setup-node@v4
        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
+          node-version: '22.11.0'

-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-          cache: "npm"
-          cache-dependency-path: "tools/server/webui/package-lock.json"
-
-      - name: Cache node_modules
-        uses: actions/cache@v4
-        id: cache-node-modules
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Install dependencies
-        if: steps.cache-node-modules.outputs.cache-hit != 'true'
-        run: npm ci
-        working-directory: tools/server/webui
-
-  webui-check:
-    needs: webui-setup
-    name: WebUI Check
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-
-      - name: Restore node_modules cache
-        uses: actions/cache@v4
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Run type checking
-        run: npm run check
-        working-directory: tools/server/webui
-
-      - name: Run linting
-        run: npm run lint
-        working-directory: tools/server/webui
-
-  webui-build:
-    needs: webui-check
-    name: WebUI Build
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-
-      - name: Restore node_modules cache
-        uses: actions/cache@v4
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Build application
-        run: npm run build
-        working-directory: tools/server/webui
-
-  webui-tests:
-    needs: webui-build
-    name: Run WebUI tests
-    permissions:
-      contents: read
-
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Setup Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-
-      - name: Restore node_modules cache
-        uses: actions/cache@v4
-        with:
-          path: tools/server/webui/node_modules
-          key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
-          restore-keys: |
-            ${{ runner.os }}-node-modules-
-
-      - name: Install Playwright browsers
-        run: npx playwright install --with-deps
-        working-directory: tools/server/webui
-
-      - name: Build Storybook
-        run: npm run build-storybook
-        working-directory: tools/server/webui
-
-      - name: Run Client tests
-        run: npm run test:client
-        working-directory: tools/server/webui
-
-      - name: Run Server tests
-        run: npm run test:server
-        working-directory: tools/server/webui
-
-      - name: Run UI tests
-        run: npm run test:ui
-        working-directory: tools/server/webui
-
-      - name: Run E2E tests
-        run: npm run test:e2e
-        working-directory: tools/server/webui
-
-  server-build:
-    needs: [webui-tests]
-    runs-on: ubuntu-latest
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
-        build_type: [RelWithDebInfo]
-        include:
-          - build_type: Release
-            sanitizer: ""
-      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
-
-    steps:
-      - name: Dependencies
-        id: depends
+      - name: WebUI - Install dependencies
+        id: webui_lint
        run: |
-          sudo apt-get update
-          sudo apt-get -y install \
-            build-essential \
-            xxd \
-            git \
-            cmake \
-            curl \
-            wget \
-            language-pack-en \
-            libcurl4-openssl-dev
+          cd tools/server/webui
+          npm ci

-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
-
-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Tests dependencies
-        id: test_dependencies
+      - name: WebUI - Check code format
+        id: webui_format
        run: |
-          pip install -r tools/server/tests/requirements.txt
+          git config --global --add safe.directory $(realpath .)
+          cd tools/server/webui
+          git status

-      - name: Setup Node.js for WebUI
-        uses: actions/setup-node@v4
-        with:
-          node-version: "22"
-          cache: "npm"
-          cache-dependency-path: "tools/server/webui/package-lock.json"
+          npm run format
+          git status
+          modified_files="$(git status -s)"
+          echo "Modified files: ${modified_files}"
+          if [ -n "${modified_files}" ]; then
+            echo "Files do not follow coding style. To fix: npm run format"
+            echo "${modified_files}"
+            exit 1
+          fi

-      - name: Install WebUI dependencies
-        run: npm ci
-        working-directory: tools/server/webui
+      - name: Verify bundled index.html
+        id: verify_server_index_html
+        run: |
+          git config --global --add safe.directory $(realpath .)
+          cd tools/server/webui
+          git status

-      - name: Build WebUI
-        run: npm run build
-        working-directory: tools/server/webui
+          npm run build
+          git status
+          modified_files="$(git status -s)"
+          echo "Modified files: ${modified_files}"
+          if [ -n "${modified_files}" ]; then
+            echo "Repository is dirty or server/webui is not built as expected"
+            echo "Hint: You may need to follow Web UI build guide in server/README.md"
+            echo "${modified_files}"
+            exit 1
+          fi

      - name: Build (no OpenMP)
        id: cmake_build_no_openmp
--- a/.gitignore
+++ b/.gitignore
@@ -148,7 +148,3 @@ poetry.toml
 /run-vim.sh
 /run-chat.sh
 .ccache/
-
-# IDE
-*.code-workspace
-.windsurf/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,12 +58,6 @@ if (MSVC)
    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
 endif()

-if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
-    set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
-else()
-    set(LLAMA_TOOLS_INSTALL_DEFAULT ${LLAMA_STANDALONE})
-endif()
-
 #
 # option list
 #
@@ -88,11 +82,9 @@ option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_TOOLS    "llama: build tools"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})
-option(LLAMA_TOOLS_INSTALL  "llama: install tools"        ${LLAMA_TOOLS_INSTALL_DEFAULT})

 # 3rd party libs
 option(LLAMA_CURL       "llama: use libcurl to download model from an URL" ON)
-option(LLAMA_OPENSSL    "llama: use openssl to support HTTPS" OFF)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)

 # Required for relocatable CMake package
--- a/124
+++ b/124
@@ -1,116 +1,12 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
-# multiplie collaborators per item can be specified

-/.devops/*.Dockerfile                   @ngxson
-/.github/actions/                       @slaren @CISC
-/.github/workflows/                     @CISC
-/.github/workflows/release.yml          @slaren
-/.github/workflows/winget.yml           @slaren
-/ci/                                    @ggerganov
-/cmake/                                 @ggerganov
-/common/CMakeLists.txt                  @ggerganov
-/common/arg.*                           @ggerganov @ericcurtin
-/common/base64.hpp.*                    @ggerganov
-/common/build-info.*                    @ggerganov
-/common/common.*                        @ggerganov
-/common/console.*                       @ggerganov
-/common/http.*                          @angt
-/common/llguidance.*                    @ggerganov
-/common/log.*                           @ggerganov
-/common/sampling.*                      @ggerganov
-/common/speculative.*                   @ggerganov
-/convert_*.py                           @CISC
-/examples/batched.swift/                @ggerganov
-/examples/batched/                      @ggerganov
-/examples/convert-llama2c-to-ggml/      @ggerganov
-/examples/deprecation-warning/          @ggerganov
-/examples/diffusion/                    @am17an
-/examples/embedding/                    @ggerganov
-/examples/eval-callback/                @ggerganov
-/examples/export-docs/                  @ggerganov
-/examples/gen-docs/                     @ggerganov
-/examples/gguf/                         @ggerganov
-/examples/llama.android/                @ggerganov
-/examples/llama.swiftui/                @ggerganov
-/examples/llama.vim                     @ggerganov
-/examples/lookahead/                    @ggerganov
-/examples/lookup/                       @JohannesGaessler
-/examples/model-conversion/             @danbev
-/examples/parallel/                     @ggerganov
-/examples/passkey/                      @ggerganov
-/examples/retrieval/                    @ggerganov
-/examples/save-load-state/              @ggerganov
-/examples/simple-chat/                  @slaren
-/examples/simple/                       @slaren
-/examples/speculative-simple/           @ggerganov
-/examples/speculative/                  @ggerganov
-/ggml/cmake/                            @ggerganov
-/ggml/include/                          @ggerganov @slaren
-/ggml/src/ggml-alloc.c                  @slaren
-/ggml/src/ggml-backend*                 @slaren
-/ggml/src/ggml-blas/                    @slaren
-/ggml/src/ggml-common.h                 @ggerganov @slaren
-/ggml/src/ggml-cpu/                     @ggerganov @slaren
-/ggml/src/ggml-cpu/spacemit/            @alex-spacemit
-/ggml/src/ggml-cuda/common.cuh          @slaren
-/ggml/src/ggml-cuda/fattn*              @JohannesGaessler
-/ggml/src/ggml-cuda/ggml-cuda.cu        @slaren
-/ggml/src/ggml-cuda/mmf.*               @JohannesGaessler
-/ggml/src/ggml-cuda/mmq.*               @JohannesGaessler
-/ggml/src/ggml-cuda/mmvf.*              @JohannesGaessler
-/ggml/src/ggml-cuda/mmvq.*              @JohannesGaessler
-/ggml/src/ggml-cuda/fattn-wmma*         @IMbackK
-/ggml/src/ggml-hip/                     @IMbackK
-/ggml/src/ggml-cuda/vendors/hip.h       @IMbackK
-/ggml/src/ggml-impl.h                   @ggerganov @slaren
-/ggml/src/ggml-metal/                   @ggerganov
-/ggml/src/ggml-opencl/                  @lhez @max-krasnyansky
-/ggml/src/ggml-opt.cpp                  @JohannesGaessler
-/ggml/src/ggml-quants.*                 @ggerganov
-/ggml/src/ggml-rpc/                     @rgerganov
-/ggml/src/ggml-threading.*              @ggerganov @slaren
-/ggml/src/ggml-vulkan/                  @0cc4m
-/ggml/src/ggml-webgpu/                  @reeselevine
-/ggml/src/ggml-zdnn/                    @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
-/ggml/src/ggml.c                        @ggerganov @slaren
-/ggml/src/ggml.cpp                      @ggerganov @slaren
-/ggml/src/gguf.cpp                      @JohannesGaessler @Green-Sky
-/gguf-py/                               @CISC
-/media/                                 @ggerganov
-/scripts/gen*                           @ggerganov
-/scripts/get*                           @ggerganov
-/scripts/sync*                          @ggerganov
-/src/                                   @ggerganov
-/src/llama-adapter.*                    @CISC
-/src/llama-arch.*                       @CISC
-/src/llama-chat.*                       @ngxson
-/src/llama-graph.*                      @CISC
-/src/llama-model-loader.*               @slaren
-/src/llama-model.*                      @CISC
-/src/llama-vocab.*                      @CISC
-/tests/                                 @ggerganov
-/tests/test-backend-ops.cpp             @slaren
-/tests/test-thread-safety.cpp           @slaren
-/tools/batched-bench/                   @ggerganov
-/tools/llama-bench/                     @slaren
-/tools/main/                            @ggerganov
-/tools/mtmd/                            @ngxson
-/tools/perplexity/                      @ggerganov
-/tools/quantize/                        @ggerganov
-/tools/rpc/                             @rgerganov
-/tools/run/                             @ericcurtin
-/tools/server/*                         @ngxson @ggerganov @ericcurtin # no subdir
-/tools/server/webui/                    @allozaur
-/tools/tokenize/                        @ggerganov
-/tools/tts/                             @ggerganov
-/vendor/                                @ggerganov
-/.clang-format                          @slaren
-/.clang-tidy                            @slaren
-/AUTHORS                                @ggerganov
-/CMakeLists.txt                         @ggerganov
-/CONTRIBUTING.md                        @ggerganov
-/LICENSE                                @ggerganov
-/README.md                              @ggerganov
-/SECURITY.md                            @ggerganov
-/build-xcframework.sh                   @danbev
-requirements*.txt                       @CISC
+/ci/ @ggerganov
+/.devops/*.Dockerfile @ngxson
+/tools/server/ @ngxson
+/ggml/src/ggml-cuda/fattn* @JohannesGaessler
+/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
+/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
+/ggml/src/ggml-opt.cpp @JohannesGaessler
+/ggml/src/gguf.cpp @JohannesGaessler
+/ggml/src/ggml-vulkan/ @0cc4m
+/ggml/src/ggml-zdnn/ @taronaeo
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,12 +1,4 @@
-# Contributors
-
-The project differentiates between 3 levels of contributors:
-
- Contributors: people who have contributed before (no special privileges)
- Collaborators (Triage): people with significant contributions, who may be responsible for some parts of the code, and are expected to maintain and review contributions for the code they own
- Maintainers: responsible for reviewing and merging PRs, after approval from the code owners
-
-# Pull requests (for contributors & collaborators)
+# Pull requests (for contributors)

 - llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
 - Test your changes:
@@ -17,16 +9,15 @@ The project differentiates between 3 levels of contributors:
 - Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments
- Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs

-# Pull requests (for maintainers)
+# Pull requests (for collaborators)

 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
 - Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
- Let other maintainers merge their own PRs
- When merging a PR, make sure you have a good understanding of the changes
+- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
+- Let authors, who are also collaborators, merge their own PRs
+- When merging a PR by a contributor, make sure you have a good understanding of the changes
 - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)

 # Coding guidelines
@@ -126,21 +117,6 @@ The project differentiates between 3 levels of contributors:
    #endif // FOO
    ```

-# Code maintenance
-
- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file reponsible for:
-  - Reviewing and merging related PRs
-  - Fixing related bugs
-  - Providing developer guidance/support
-
- When adding or modifying a large piece of code:
-  - If you are a collaborator, make sure to add yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
-  - If you are a contributor, find an existing collaborator who is willing to review and maintain your code long-term
-  - Provide the necessary CI workflow (and hardware) to test your changes (see [ci/README.md](https://github.com/ggml-org/llama.cpp/tree/master/ci))
-
- New code should follow the guidelines (coding, naming, etc.) outlined in this document. Exceptions are allowed in isolated, backend-specific parts of the code that do not interface directly with the `ggml` interfaces.
-  _(NOTE: for legacy reasons, existing code is not required to follow this guideline)_
-
 # Documentation

 - Documentation is a community effort
--- a/README.md
+++ b/README.md
@@ -178,7 +178,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
- Java: [QuasarByte/llama-cpp-jna](https://github.com/QuasarByte/llama-cpp-jna)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 - Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
@@ -275,7 +274,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [Vulkan](docs/build.md#vulkan) | GPU |
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
-| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
 | [WebGPU [In Progress]](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |

@@ -522,8 +520,8 @@ To learn more about model quantization, [read this documentation](tools/quantize
 ## Contributing

 - Contributors can open PRs
+- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
- Maintainers can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Any help with managing issues, PRs and projects is very appreciated!
 - See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -422,7 +422,6 @@ echo "Building for iOS devices..."
 cmake -B build-ios-device -G Xcode \
    "${COMMON_CMAKE_ARGS[@]}" \
    -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \
-    -DCMAKE_SYSTEM_NAME=iOS \
    -DCMAKE_OSX_SYSROOT=iphoneos \
    -DCMAKE_OSX_ARCHITECTURES="arm64" \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
--- a/ci/README-MUSA.md
+++ b/ci/README-MUSA.md
@@ -1,35 +0,0 @@
-## Running MUSA CI in a Docker Container
-
-Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
-
-### 1. Create a local directory to store cached models, configuration files and venv:
-
-```bash
-mkdir -p $HOME/llama.cpp/ci-cache
-```
-
-### 2. Create a local directory to store CI run results:
-
-```bash
-mkdir -p $HOME/llama.cpp/ci-results
-```
-
-### 3. Start a Docker container and run the CI:
-
-```bash
-docker run --privileged -it \
-    -v $HOME/llama.cpp/ci-cache:/ci-cache \
-    -v $HOME/llama.cpp/ci-results:/ci-results \
-    -v $PWD:/ws -w /ws \
-    mthreads/musa:rc4.3.0-devel-ubuntu22.04-amd64
-```
-
-Inside the container, execute the following commands:
-
-```bash
-apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
-git config --global --add safe.directory /ws
-GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
-```
-
-This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.
--- a/ci/README.md
+++ b/ci/README.md
@@ -1,10 +1,18 @@
 # CI

-This CI implements heavy-duty workflows that run on self-hosted runners. Typically the purpose of these workflows is to
-cover hardware configurations that are not available from Github-hosted runners and/or require more computational
-resource than normally available.
+In addition to [Github Actions](https://github.com/ggml-org/llama.cpp/actions) `llama.cpp` uses a custom CI framework:

-It is a good practice, before publishing changes to execute the full CI locally on your machine. For example:
+https://github.com/ggml-org/ci
+
+It monitors the `master` branch for new commits and runs the
+[ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
+to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
+to cover various hardware architectures, including GPU and Apple Silicon instances.
+
+Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
+Only the branches of this repo are monitored for this keyword.
+
+It is a good practice, before publishing changes to execute the full CI locally on your machine:

 ```bash
 mkdir tmp
@@ -21,13 +29,40 @@ GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

 # with MUSA support
 GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
-# etc.
 ```

-# Adding self-hosted runners
+## Running MUSA CI in a Docker Container

- Add a self-hosted `ggml-ci` workflow to [[.github/workflows/build.yml]] with an appropriate label
- Request a runner token from `ggml-org` (for example, via a comment in the PR or email)
- Set-up a machine using the received token ([docs](https://docs.github.com/en/actions/how-tos/manage-runners/self-hosted-runners/add-runners))
- Optionally update [ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) to build and run on the target platform by gating the implementation with a `GG_BUILD_...` env
+Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
+
+### 1. Create a local directory to store cached models, configuration files and venv:
+
+```bash
+mkdir -p $HOME/llama.cpp/ci-cache
+```
+
+### 2. Create a local directory to store CI run results:
+
+```bash
+mkdir -p $HOME/llama.cpp/ci-results
+```
+
+### 3. Start a Docker container and run the CI:
+
+```bash
+docker run --privileged -it \
+    -v $HOME/llama.cpp/ci-cache:/ci-cache \
+    -v $HOME/llama.cpp/ci-results:/ci-results \
+    -v $PWD:/ws -w /ws \
+    mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
+```
+
+Inside the container, execute the following commands:
+
+```bash
+apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
+git config --global --add safe.directory /ws
+GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
+```
+
+This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -22,9 +22,6 @@
 # # with MUSA support
 # GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
-# # with KLEIDIAI support
-# GG_BUILD_KLEIDIAI=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-#

 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@@ -37,9 +34,9 @@ mkdir -p "$2"
 OUT=$(realpath "$1")
 MNT=$(realpath "$2")

-rm -f $OUT/*.log
-rm -f $OUT/*.exit
-rm -f $OUT/*.md
+rm -f "$OUT/*.log"
+rm -f "$OUT/*.exit"
+rm -f "$OUT/*.md"

 sd=`dirname $0`
 cd $sd/../
@@ -48,7 +45,7 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
@@ -68,16 +65,6 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
    fi
 fi

-if [ ! -z ${GG_BUILD_ROCM} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_HIP=ON"
-    if [ -z ${GG_BUILD_AMDGPU_TARGETS} ]; then
-        echo "Missing GG_BUILD_AMDGPU_TARGETS, please set it to your GPU architecture (e.g. gfx90a, gfx1100, etc.)"
-        exit 1
-    fi
-
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DAMDGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
-fi
-
 if [ ! -z ${GG_BUILD_SYCL} ]; then
    if [ -z ${ONEAPI_ROOT} ]; then
        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
@@ -95,12 +82,6 @@ fi

 if [ ! -z ${GG_BUILD_VULKAN} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
-
-    # if on Mac, disable METAL
-    if [[ "$OSTYPE" == "darwin"* ]]; then
-        CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
-    fi
-
 fi

 if [ ! -z ${GG_BUILD_WEBGPU} ]; then
@@ -112,40 +93,6 @@ if [ ! -z ${GG_BUILD_MUSA} ]; then
    MUSA_ARCH=${MUSA_ARCH:-21}
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
 fi
-
-if [ ! -z ${GG_BUILD_NO_SVE} ]; then
-    # arm 9 and newer enables sve by default, adjust these flags depending on the cpu used
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
-fi
-
-if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
-    echo ">>===== Enabling KleidiAI support"
-
-    CANDIDATES=("armv9-a+dotprod+i8mm" "armv8.6-a+dotprod+i8mm" "armv8.2-a+dotprod")
-    CPU=""
-
-    for cpu in "${CANDIDATES[@]}"; do
-        if echo 'int main(){}' | ${CXX:-c++} -march="$cpu" -x c++ - -c -o /dev/null >/dev/null 2>&1; then
-            CPU="$cpu"
-            break
-        fi
-    done
-
-    if [ -z "$CPU" ]; then
-        echo "ERROR: None of the required ARM baselines (armv9/armv8.6/armv8.2 + dotprod) are supported by this compiler."
-        exit 1
-    fi
-
-    echo ">>===== Using ARM baseline: ${CPU}"
-
-    CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } \
-        -DGGML_NATIVE=OFF \
-        -DGGML_CPU_KLEIDIAI=ON \
-        -DGGML_CPU_AARCH64=ON \
-        -DGGML_CPU_ARM_ARCH=${CPU} \
-        -DBUILD_SHARED_LIBS=OFF"
-fi
-
 ## helpers

 # download a file if it does not exist or if it is outdated
@@ -203,7 +150,7 @@ function gg_run_ctest_debug {
    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                  ) 2>&1 | tee -a $OUT/${ci}-make.log

-    (time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
 }
@@ -253,9 +200,33 @@ function gg_sum_ctest_release {
    gg_printf '```\n'
 }

-# test_scripts
+# test_scripts_debug

-function gg_run_test_scripts {
+function gg_run_test_scripts_debug {
+    cd ${SRC}
+
+    set -e
+
+    (cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./tools/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+
+    set +e
+}
+
+function gg_sum_test_scripts_debug {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs test scripts in debug mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
+# test_scripts_release
+
+function gg_run_test_scripts_release {
    cd ${SRC}

    set -e
@@ -266,10 +237,10 @@ function gg_run_test_scripts {
    set +e
 }

-function gg_sum_test_scripts {
+function gg_sum_test_scripts_release {
    gg_printf '### %s\n\n' "${ci}"

-    gg_printf 'Runs test scripts\n'
+    gg_printf 'Runs test scripts in release mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
@@ -278,9 +249,15 @@ function gg_sum_test_scripts {
 }

 function gg_get_model {
-    local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
+    local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
+    local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
+    local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
    if [[ -s $gguf_0 ]]; then
        echo -n "$gguf_0"
+    elif [[ -s $gguf_1 ]]; then
+        echo -n "$gguf_1"
+    elif [[ -s $gguf_2 ]]; then
+        echo -n "$gguf_2"
    else
        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
        exit 1
@@ -293,9 +270,7 @@ function gg_run_ctest_with_model_debug {
    local model; model=$(gg_get_model)
    cd build-ci-debug
    set -e
-
    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
-
    set +e
    cd ..
 }
@@ -306,15 +281,7 @@ function gg_run_ctest_with_model_release {
    local model; model=$(gg_get_model)
    cd build-ci-release
    set -e
-
    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
-
-    # test memory leaks
-    #if [[ ! -z ${GG_BUILD_METAL} ]]; then
-    #    # TODO: this hangs for some reason ...
-    #    (time leaks -quiet -atExit -- ./bin/test-thread-safety -m $model --parallel 2 -t 2 -p "hello") 2>&1 | tee -a $OUT/${ci}-leaks.log
-    #fi
-
    set +e
    cd ..
 }
@@ -339,22 +306,24 @@ function gg_sum_ctest_with_model_release {
    gg_printf '```\n'
 }

-# qwen3_0_6b
+# open_llama_7b_v2

-function gg_run_qwen3_0_6b {
+function gg_run_open_llama_7b_v2 {
    cd ${SRC}

-    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/config.json
-    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/tokenizer.json
-    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/tokenizer_config.json
-   #gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/special_tokens_map.json
-    gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/resolve/main/model.safetensors
-
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/

-    path_models="../models-mnt/qwen3/0.6B"
+    path_models="../models-mnt/open-llama/7B-v2"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
@@ -364,11 +333,9 @@ function gg_run_qwen3_0_6b {
    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf  --outtype f16
-    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16
+    python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
-    model_bf16="${path_models}/ggml-model-bf16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
@@ -382,51 +349,179 @@ function gg_run_qwen3_0_6b {

    wiki_test="${path_wiki}/wiki.test.raw"

-    ./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0 $(nproc)
-    ./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0 $(nproc)
-    ./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1 $(nproc)
-    ./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0 $(nproc)
-    ./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1 $(nproc)
-    ./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k $(nproc)
-    ./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k $(nproc)
-    ./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k $(nproc)
-    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
-    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
+    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

-    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    if [ -z ${GG_BUILD_NO_BF16} ]; then
-        (time ./bin/llama-perplexity --model ${model_bf16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
-    fi
-    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+
+    function check_ppl {
+        qnt="$1"
+        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
+        return 0
+    }
+
+    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
+    set +e
+}
+
+function gg_sum_open_llama_7b_v2 {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'OpenLLaMA 7B-v2:\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
+    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+}
+
+# pythia_1.4b
+
+function gg_run_pythia_1_4b {
+    cd ${SRC}
+
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
+    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
+
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
+    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
+
+    path_models="../models-mnt/pythia/1.4B"
+    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
+
+    wiki_test_60="${path_wiki}/wiki.test-60.raw"
+
+    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
+
+    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -442,9 +537,6 @@ function gg_run_qwen3_0_6b {
    }

    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    if [ -z ${GG_BUILD_NO_BF16} ]; then
-        check_ppl "bf16" "$(cat $OUT/${ci}-tg-bf16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    fi
    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
@@ -461,17 +553,147 @@ function gg_run_qwen3_0_6b {
    set +e
 }

-function gg_sum_qwen3_0_6b {
+function gg_sum_pythia_1_4b {
    gg_printf '### %s\n\n' "${ci}"

-    gg_printf 'Qwen3 0.6B:\n'
+    gg_printf 'Pythia 1.4B:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- f16:\n```\n%s\n```\n'  "$(cat $OUT/${ci}-tg-f16.log)"
-    if [ -z ${GG_BUILD_NO_BF16} ]; then
-        gg_printf '- bf16:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-bf16.log)"
-    fi
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
+    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+}
+
+# pythia_2_8b
+
+function gg_run_pythia_2_8b {
+    cd ${SRC}
+
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
+    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
+
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
+    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+
+    path_models="../models-mnt/pythia/2.8B"
+    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
+
+    wiki_test="${path_wiki}/wiki.test.raw"
+
+    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
+
+    (time ./bin/llama-cli -no-cnv --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa on  ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+
+    function check_ppl {
+        qnt="$1"
+        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
+        return 0
+    }
+
+    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+   #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
+    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
+    set +e
+}
+
+function gg_sum_pythia_2_8b {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Pythia 2.8B:\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
@@ -543,7 +765,12 @@ function gg_run_rerank_tiny {
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
-    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
+    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
+
+    gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json

    path_models="../models-mnt/rerank-tiny"

@@ -633,8 +860,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
 fi

 ret=0
-
-test $ret -eq 0 && gg_run ctest_debug
+if [ -z ${GG_BUILD_SYCL} ]; then
+    # SYCL build breaks with debug build flags
+    test $ret -eq 0 && gg_run ctest_debug
+fi
 test $ret -eq 0 && gg_run ctest_release

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -642,15 +871,24 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run rerank_tiny

    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
-        test $ret -eq 0 && gg_run test_scripts
+        if [ -z ${GG_BUILD_SYCL} ]; then
+            test $ret -eq 0 && gg_run test_scripts_debug
+        fi
+        test $ret -eq 0 && gg_run test_scripts_release
    fi

-    test $ret -eq 0 && gg_run qwen3_0_6b
-
-    test $ret -eq 0 && gg_run ctest_with_model_debug
-    test $ret -eq 0 && gg_run ctest_with_model_release
+    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
+        if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
+            test $ret -eq 0 && gg_run pythia_1_4b
+        else
+            test $ret -eq 0 && gg_run pythia_2_8b
+            #test $ret -eq 0 && gg_run open_llama_7b_v2
+        fi
+        if [ -z ${GG_BUILD_SYCL} ]; then
+            test $ret -eq 0 && gg_run ctest_with_model_debug
+        fi
+        test $ret -eq 0 && gg_run ctest_with_model_release
+    fi
 fi

-cat $OUT/README.md
-
 exit $ret
--- a/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
+++ b/cmake/riscv64-spacemit-linux-gnu-gcc.cmake
@@ -1,29 +0,0 @@
-set(CMAKE_SYSTEM_NAME Linux)
-set(CMAKE_SYSTEM_PROCESSOR riscv64)
-set(CMAKE_SYSTEM_VERSION 1)
-
-if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(riscv)")
-    message(STATUS "HOST SYSTEM ${CMAKE_HOST_SYSTEM_PROCESSOR}")
-else()
-    set(GNU_MACHINE riscv64-unknown-linux-gnu CACHE STRING "GNU compiler triple")
-    if (DEFINED ENV{RISCV_ROOT_PATH})
-        file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
-    else()
-        message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
-    endif()
-
-    set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain")
-    set(CMAKE_C_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc)
-    set(CMAKE_CXX_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++)
-    set(CMAKE_STRIP ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-strip)
-    set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu")
-    set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot")
-endif()
-
-set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
-set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
-set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
-set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
-set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
-set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
-set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -56,7 +56,6 @@ add_library(${TARGET} STATIC
    common.h
    console.cpp
    console.h
-    http.h
    json-partial.cpp
    json-partial.h
    json-schema-to-grammar.cpp
@@ -88,43 +87,7 @@ if (LLAMA_CURL)
    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
    include_directories(${CURL_INCLUDE_DIRS})
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
-endif()
-
-if (LLAMA_OPENSSL)
-    find_package(OpenSSL)
-    if (OpenSSL_FOUND)
-        include(CheckCSourceCompiles)
-        set(SAVED_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES})
-        set(CMAKE_REQUIRED_INCLUDES ${OPENSSL_INCLUDE_DIR})
-        check_c_source_compiles("
-        #include <openssl/opensslv.h>
-        #if defined(OPENSSL_IS_BORINGSSL) || defined(LIBRESSL_VERSION_NUMBER)
-        #    if OPENSSL_VERSION_NUMBER < 0x1010107f
-        #        error bad version
-        #    endif
-        #else
-        #    if OPENSSL_VERSION_NUMBER < 0x30000000L
-        #        error bad version
-        #    endif
-        #endif
-        int main() { return 0; }
-        " OPENSSL_VERSION_SUPPORTED)
-        set(CMAKE_REQUIRED_INCLUDES ${SAVED_CMAKE_REQUIRED_INCLUDES})
-        if (OPENSSL_VERSION_SUPPORTED)
-            message(STATUS "OpenSSL found: ${OPENSSL_VERSION}")
-            target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_OPENSSL_SUPPORT)
-            target_link_libraries(${TARGET} PUBLIC OpenSSL::SSL OpenSSL::Crypto)
-            if (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
-                target_compile_definitions(${TARGET} PUBLIC CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN)
-                find_library(CORE_FOUNDATION_FRAMEWORK CoreFoundation REQUIRED)
-                find_library(SECURITY_FRAMEWORK Security REQUIRED)
-                target_link_libraries(${TARGET} PUBLIC ${CORE_FOUNDATION_FRAMEWORK} ${SECURITY_FRAMEWORK})
-            endif()
-        endif()
-    else()
-        message(STATUS "OpenSSL not found, SSL support disabled")
-    endif()
-endif()
+endif ()

 if (LLAMA_LLGUIDANCE)
    include(ExternalProject)
--- a/common/arg.cpp
+++ b/common/arg.cpp
--- a/common/arg.h
+++ b/common/arg.h
@@ -78,6 +78,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e

 // function to be used by test-arg-parser
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+bool common_has_curl();

 struct common_remote_params {
    std::vector<std::string> headers;
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -3,12 +3,9 @@
 #include "log.h"
 #include "regex-partial.h"

-#include <algorithm>
-#include <cctype>
 #include <optional>
 #include <stdexcept>
 #include <string>
-#include <string_view>
 #include <vector>

 using json = nlohmann::ordered_json;
@@ -78,35 +75,6 @@ bool common_chat_msg_parser::add_tool_calls(const json & arr) {
    }
    return true;
 }
-
-bool common_chat_msg_parser::add_tool_call_short_form(const json & tool_call) {
-    if (!tool_call.is_object() || tool_call.size() != 1) {
-        return false;
-    }
-
-    // Get the tool name (the single key in the object)
-    auto it = tool_call.begin();
-    std::string name = it.key();
-
-    if (name.empty()) {
-        return false;
-    }
-
-    // Get the arguments (the nested object)
-    const json & args_json = it.value();
-    std::string arguments = "";
-
-    if (args_json.is_object()) {
-        arguments = args_json.dump();
-    } else if (args_json.is_string()) {
-        arguments = args_json;
-    } else if (!args_json.is_null()) {
-        // For other types, convert to string representation
-        arguments = args_json.dump();
-    }
-
-    return add_tool_call(name, "", arguments);
-}
 void common_chat_msg_parser::finish() {
    if (!is_partial_ && pos_ != input_.size()) {
        throw std::runtime_error("Unexpected content at end of input");// + input_.substr(pos_));
@@ -169,27 +137,6 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
 }

 bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
-    std::string pending_reasoning_prefix;
-
-    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
-        return false;
-    }
-
-    auto set_reasoning_prefix = [&](size_t prefix_pos) {
-        if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
-            return;
-        }
-        if (prefix_pos + start_think.size() > input_.size()) {
-            pending_reasoning_prefix.clear();
-            return;
-        }
-        // Capture the exact literal that opened the reasoning section so we can
-        // surface it back to callers. This ensures formats that force the
-        // reasoning tag open (e.g. DeepSeek R1) retain their original prefix
-        // instead of dropping it during parsing.
-        pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
-    };
-
    auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
        auto stripped_reasoning = string_strip(reasoning);
        if (stripped_reasoning.empty()) {
@@ -202,116 +149,28 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
                add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
            }
        } else {
-            if (!pending_reasoning_prefix.empty()) {
-                add_reasoning_content(pending_reasoning_prefix);
-                pending_reasoning_prefix.clear();
-            }
            add_reasoning_content(stripped_reasoning);
        }
    };
-
-    const size_t saved_pos = pos_;
-    const size_t saved_content_size = result_.content.size();
-    const size_t saved_reasoning_size = result_.reasoning_content.size();
-
-    auto restore_state = [&]() {
-        move_to(saved_pos);
-        result_.content.resize(saved_content_size);
-        result_.reasoning_content.resize(saved_reasoning_size);
-    };
-
-    // Allow leading whitespace to be preserved as content when reasoning is present at the start
-    size_t cursor = pos_;
-    size_t whitespace_end = cursor;
-    while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
-        ++whitespace_end;
-    }
-
-    if (whitespace_end >= input_.size()) {
-        restore_state();
-        if (syntax_.thinking_forced_open) {
-            auto rest = input_.substr(saved_pos);
+    if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
+        if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
+            if (auto res = try_find_literal(end_think)) {
+                handle_reasoning(res->prelude, /* closed */ true);
+                consume_spaces();
+                return true;
+            }
+            auto rest = consume_rest();
            if (!rest.empty()) {
                handle_reasoning(rest, /* closed */ !is_partial());
            }
-            move_to(input_.size());
+            // Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
+            // if (!syntax_.thinking_forced_open) {
+            //     throw common_chat_msg_partial_exception(end_think);
+            // }
            return true;
        }
-        return false;
-    }
-
-    cursor = whitespace_end;
-    const size_t remaining = input_.size() - cursor;
-    const size_t start_prefix = std::min(start_think.size(), remaining);
-    const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;
-
-    if (has_start_tag && start_prefix < start_think.size()) {
-        move_to(input_.size());
-        return true;
-    }
-
-    if (has_start_tag) {
-        if (whitespace_end > pos_) {
-            add_content(input_.substr(pos_, whitespace_end - pos_));
-        }
-        set_reasoning_prefix(cursor);
-        cursor += start_think.size();
-    } else if (syntax_.thinking_forced_open) {
-        cursor = whitespace_end;
-    } else {
-        restore_state();
-        return false;
-    }
-    while (true) {
-        if (cursor >= input_.size()) {
-            move_to(input_.size());
-            return true;
-        }
-
-        size_t end_pos = input_.find(end_think, cursor);
-        if (end_pos == std::string::npos) {
-            std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
-            size_t partial_off = string_find_partial_stop(remaining_view, end_think);
-            size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
-            if (reasoning_end > cursor) {
-                handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
-            }
-            move_to(input_.size());
-            return true;
-        }
-
-        if (end_pos > cursor) {
-            handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
-        } else {
-            handle_reasoning("", /* closed */ true);
-        }
-
-        cursor = end_pos + end_think.size();
-
-        while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
-            ++cursor;
-        }
-
-        const size_t next_remaining = input_.size() - cursor;
-        if (next_remaining == 0) {
-            move_to(cursor);
-            return true;
-        }
-
-        const size_t next_prefix = std::min(start_think.size(), next_remaining);
-        if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
-            if (next_prefix < start_think.size()) {
-                move_to(input_.size());
-                return true;
-            }
-            set_reasoning_prefix(cursor);
-            cursor += start_think.size();
-            continue;
-        }
-
-        move_to(cursor);
-        return true;
    }
+    return false;
 }

 std::string common_chat_msg_parser::consume_rest() {
--- a/common/chat-parser.h
+++ b/common/chat-parser.h
@@ -64,9 +64,6 @@ class common_chat_msg_parser {
    // Adds an array of tool calls using their "name", "id" and "arguments" fields.
    bool add_tool_calls(const nlohmann::ordered_json & arr);

-    // Adds a tool call using the short form: { "tool_name": { "arg1": val, "arg2": val } }
-    bool add_tool_call_short_form(const nlohmann::ordered_json & tool_call);
-
    void finish();

    bool consume_spaces();
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -625,7 +625,6 @@ const char * common_chat_format_name(common_chat_format format) {
        case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
        case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
        case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
-        case COMMON_CHAT_FORMAT_MAGISTRAL: return "Magistral";
        case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
        case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
        case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
@@ -639,7 +638,6 @@ const char * common_chat_format_name(common_chat_format format) {
        case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
        case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
        case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
-        case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
        default:
            throw std::runtime_error("Unknown chat format");
    }
@@ -803,7 +801,6 @@ static std::string apply(
    }
    tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
    tmpl_inputs.extra_context = inputs.extra_context;
-    tmpl_inputs.extra_context["enable_thinking"] = inputs.enable_thinking;
    if (additional_context) {
        tmpl_inputs.extra_context.merge_patch(*additional_context);
    }
@@ -985,65 +982,6 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
    data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
    return data;
 }
-
-static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
-    common_chat_params data;
-    data.prompt = apply(tmpl, inputs);
-    data.format = COMMON_CHAT_FORMAT_MAGISTRAL;
-    data.preserved_tokens = {
-        "[THINK]",
-        "[/THINK]",
-    };
-
-    if (inputs.tools.is_array() && !inputs.tools.empty()) {
-        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
-        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
-            auto schemas = json::array();
-            foreach_function(inputs.tools, [&](const json & tool) {
-                const auto & function = tool.at("function");
-                schemas.push_back({
-                    {"type", "object"},
-                    {"properties", {
-                        {"name", {
-                            {"type", "string"},
-                            {"const", function.at("name")},
-                        }},
-                        {"arguments", function.at("parameters")},
-                        {"id", {
-                            {"type", "string"},
-                            {"pattern", "^[a-zA-Z0-9]{9}$"},
-                        }},
-                    }},
-                    {"required", json::array({"name", "arguments", "id"})},
-                });
-            });
-            auto schema = json {
-                {"type", "array"},
-                {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
-                {"minItems", 1},
-            };
-            if (!inputs.parallel_tool_calls) {
-                schema["maxItems"] = 1;
-            }
-            builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema));
-        });
-        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"});
-        data.preserved_tokens.push_back("[TOOL_CALLS]");
-    } else {
-        data.grammar_lazy = false;
-        if (!inputs.json_schema.is_null()) {
-            if (!inputs.grammar.empty()) {
-                throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
-            }
-            data.grammar = json_schema_to_grammar(inputs.json_schema);
-        } else {
-            data.grammar = inputs.grammar;
-        }
-    }
-
-    return data;
-}
-
 static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
@@ -1054,18 +992,6 @@ static void common_chat_parse_mistral_nemo(common_chat_msg_parser & builder) {
    parse_prefixed_json_tool_call_array(builder, prefix);
 }

-static void common_chat_parse_magistral(common_chat_msg_parser & builder) {
-    builder.try_parse_reasoning("[THINK]", "[/THINK]");
-
-    if (!builder.syntax().parse_tool_calls) {
-        builder.add_content(builder.consume_rest());
-        return;
-    }
-
-    static const common_regex prefix(regex_escape("[TOOL_CALLS]"));
-    parse_prefixed_json_tool_call_array(builder, prefix);
-}
-
 static common_chat_params common_chat_params_init_command_r7b(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;

@@ -1338,78 +1264,7 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_
    }
    return data;
 }
-
-static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) {
-    common_chat_params data;
-
-    // Generate the prompt using the apply() function with the template
-    data.prompt = apply(tmpl, inputs);
-    data.format = COMMON_CHAT_FORMAT_APERTUS;
-
-    // Handle thinking tags appropriately based on inputs.enable_thinking
-    if (string_ends_with(data.prompt, "<|inner_prefix|>")) {
-        if (!inputs.enable_thinking) {
-            data.prompt += "<|inner_suffix|>";
-        } else {
-            data.thinking_forced_open = true;
-        }
-    }
-
-    // When tools are present, build grammar for the <|tools_prefix|> format
-    if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
-        data.grammar_lazy = true;
-        data.grammar      = build_grammar([&](const common_grammar_builder & builder) {
-            auto schemas = json::array();
-            foreach_function(inputs.tools, [&](const json & tool) {
-                const auto & function = tool.at("function");
-                schemas.push_back({
-                    { "type",       "object"                                                   },
-                    { "properties",
-                        {
-                            { function.at("name"), function.at("parameters") }
-                        }                                                                        },
-                    { "required",   json::array({ function.at("name") }) },
-                });
-            });
-            auto schema = json{
-                        { "type",     "array"                                                         },
-                        { "items",    schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
-                        { "minItems", 1                                                               },
-            };
-            if (!inputs.parallel_tool_calls) {
-                schema["maxItems"] = 1;
-            }
-            builder.add_rule("root",
-                                std::string(data.thinking_forced_open ? "( \"<|inner_suffix|>\" space )? " : "") +
-                                    "\"<|tools_prefix|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tools_suffix|>\"");
-                            });
-        data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
-            // If thinking_forced_open, then we capture the <|inner_suffix|> tag in the grammar,
-            // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
-            std::string(data.thinking_forced_open ?
-                            "[\\s\\S]*?(<\\|inner_suffix\\|>\\s*)" :
-                            "(?:<\\|inner_prefix\\|>[\\s\\S]*?<\\|inner_suffix\\|>\\s*)?") +
-                "(<\\|tools_prefix\\|>)[\\s\\S]*" });
-        data.preserved_tokens = {
-            "<|system_start|>",
-            "<|system_end|>",
-            "<|developer_start|>",
-            "<|developer_end|>",
-            "<|user_start|>",
-            "<|user_end|>",
-            "<|assistant_start|>",
-            "<|assistant_end|>",
-            "<|inner_prefix|>",
-            "<|inner_suffix|>",
-            "<|tools_prefix|>",
-            "<|tools_suffix|>",
-        };
-    }
-    return data;
-}
 static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
-    builder.try_parse_reasoning("<think>", "</think>");
-
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
@@ -1761,36 +1616,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
                );
            });

+            auto recipient_in_role = builder.add_rule("recipient_in_role",
+                "\"<|start|>assistant\"? \" to=functions.\" ( " +
+                string_join(tool_rules_recipient_in_role, " | ") + " )"
+            );
+
            auto recipient_in_channel = builder.add_rule("recipient_in_channel",
                channel + " \" to=functions.\" ( " +
                string_join(tool_rules_recipient_in_channel, " | ") + " )"
            );

-            if (data.grammar_lazy) {
-                auto recipient_in_role = builder.add_rule("recipient_in_role",
-                    "\"<|start|>assistant\"? \" to=functions.\" ( " +
-                    string_join(tool_rules_recipient_in_role, " | ") + " )"
-                );
-
-                builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
-            } else {
-                auto not_end = builder.add_rule("not-end",
-                    "[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
-                auto analysis = builder.add_rule("analysis",
-                    "\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
-                auto commentary = builder.add_rule("commentary",
-                    "\"<|channel|>commentary<|message|>\" ( " + not_end + " )* \"<|end|>\"");
-
-                auto recipient_in_role = builder.add_rule("recipient_in_role",
-                    "\" to=functions.\" ( " + string_join(tool_rules_recipient_in_role, " | ") + " )"
-                );
-
-                builder.add_rule("root",
-                    "( " + analysis + " \"<|start|>assistant\" )? " +
-                    "( " + commentary + " \"<|start|>assistant\" )? " +
-                    "( " + recipient_in_role + " | " + recipient_in_channel + " )"
-                );
-            }
+            builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);

            // Trigger on tool calls that appear in the commentary channel
            data.grammar_triggers.push_back({
@@ -1905,12 +1741,10 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
    LOG_DBG("%s\n", __func__);
    common_chat_params data;
-    const std::optional<json> tools_override = json();
-    const std::optional<json> additional_context = json {
+    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
        {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
        {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
-    };
-    data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, tools_override, additional_context);
+    });
    if (inputs.tools.is_array() && !inputs.tools.empty()) {
        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -2396,28 +2230,15 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp

 static void common_chat_parse_granite(common_chat_msg_parser & builder) {
    // Parse thinking tags
-    static const common_regex start_think_regex(regex_escape("<think>"));
-    static const common_regex end_think_regex(regex_escape("</think>"));
-    // Granite models output partial tokens such as "<" and "<think".
-    // By leveraging try_consume_regex()/try_find_regex() throwing
-    // common_chat_msg_partial_exception for these partial tokens,
-    // processing is interrupted and the tokens are not passed to add_content().
-    if (auto res = builder.try_consume_regex(start_think_regex)) {
-        // Restore position for try_parse_reasoning()
-        builder.move_to(res->groups[0].begin);
-        builder.try_find_regex(end_think_regex, std::string::npos, false);
-        // Restore position for try_parse_reasoning()
-        builder.move_to(res->groups[0].begin);
-    }
    builder.try_parse_reasoning("<think>", "</think>");

-    // Parse response tags
-    static const common_regex start_response_regex(regex_escape("<response>"));
-    static const common_regex end_response_regex(regex_escape("</response>"));
-    // Granite models output partial tokens such as "<" and "<response".
-    // Same hack as reasoning parsing.
-    if (builder.try_consume_regex(start_response_regex)) {
-        builder.try_find_regex(end_response_regex);
+    // Parse response tags using regex
+    static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
+    if (auto res = builder.try_find_regex(response_regex)) {
+        // Extract the content between the tags (capture group 1)
+        auto content = builder.str(res->groups[1]);
+        builder.add_content(content);
+        builder.move_to(res->groups[0].end);
    }

    if (!builder.syntax().parse_tool_calls) {
@@ -2431,10 +2252,13 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
        builder.move_to(res->groups[0].end);

        // Expect JSON array of tool calls
-        if (auto tool_call = builder.try_consume_json_with_dumped_args({{{"arguments"}}})) {
-            if (!builder.add_tool_calls(tool_call->value) || tool_call->is_partial) {
-                throw common_chat_msg_partial_exception("incomplete tool call");
+        auto tool_calls_data = builder.consume_json();
+        if (tool_calls_data.json.is_array()) {
+            if (!builder.add_tool_calls(tool_calls_data.json)) {
+                builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
            }
+        } else {
+            builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
        }
    } else {
        builder.add_content(builder.consume_rest());
@@ -2468,37 +2292,6 @@ static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
    builder.add_content(builder.consume_rest());
 }

-static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
-    // Parse thinking tags
-    builder.try_parse_reasoning("<|inner_prefix|>", "<|inner_suffix|>");
-    if (!builder.syntax().parse_tool_calls) {
-        builder.add_content(builder.consume_rest());
-        return;
-    }
-
-    // Look for tool calls
-    static const common_regex tool_call_regex(regex_escape("<|tools_prefix|>"));
-    if (auto res = builder.try_find_regex(tool_call_regex)) {
-        builder.move_to(res->groups[0].end);
-
-        auto tool_calls_data = builder.consume_json();
-        if (tool_calls_data.json.is_array()) {
-            builder.consume_spaces();
-            if (!builder.try_consume_literal("<|tools_suffix|>")) {
-                throw common_chat_msg_partial_exception("Incomplete tool call");
-            }
-            for (const auto & value : tool_calls_data.json) {
-                if (value.is_object()) {
-                    builder.add_tool_call_short_form(value);
-                }
-            }
-        } else {
-            throw common_chat_msg_partial_exception("Incomplete tool call");
-        }
-    }
-    builder.add_content(builder.consume_rest());
-}
-
 static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
    // Parse thinking tags first - this handles the main reasoning content
    builder.try_parse_reasoning("<seed:think>", "</seed:think>");
@@ -2743,11 +2536,6 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_nemotron_v2(tmpl, params);
    }

-    // Apertus format detection
-    if (src.find("<|system_start|>") != std::string::npos && src.find("<|tools_prefix|>") != std::string::npos) {
-        return common_chat_params_init_apertus(tmpl, params);
-    }
-
    // Use generic handler when mixing tools + JSON schema.
    // TODO: support that mix in handlers below.
    if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2776,10 +2564,6 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
    }

-    if (src.find("[THINK]") != std::string::npos && src.find("[/THINK]") != std::string::npos) {
-        return common_chat_params_init_magistral(tmpl, params);
-    }
-
    // Plain handler (no tools)
    if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
        return common_chat_params_init_without_tools(tmpl, params);
@@ -2864,7 +2648,6 @@ common_chat_params common_chat_templates_apply(
 }

 static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
-    builder.try_parse_reasoning("<think>", "</think>");
    builder.add_content(builder.consume_rest());
 }

@@ -2881,9 +2664,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
        case COMMON_CHAT_FORMAT_MISTRAL_NEMO:
            common_chat_parse_mistral_nemo(builder);
            break;
-        case COMMON_CHAT_FORMAT_MAGISTRAL:
-            common_chat_parse_magistral(builder);
-            break;
        case COMMON_CHAT_FORMAT_LLAMA_3_X:
            common_chat_parse_llama_3_1(builder);
            break;
@@ -2923,9 +2703,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
        case COMMON_CHAT_FORMAT_NEMOTRON_V2:
            common_chat_parse_nemotron_v2(builder);
            break;
-        case COMMON_CHAT_FORMAT_APERTUS:
-            common_chat_parse_apertus(builder);
-            break;
        default:
            throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
    }
--- a/common/chat.h
+++ b/common/chat.h
@@ -33,8 +33,8 @@ struct common_chat_msg_content_part {
 struct common_chat_msg {
    std::string role;
    std::string content;
-    std::vector<common_chat_msg_content_part> content_parts;
-    std::vector<common_chat_tool_call> tool_calls;
+    std::vector<common_chat_msg_content_part> content_parts = {};
+    std::vector<common_chat_tool_call> tool_calls = {};
    std::string reasoning_content;
    std::string tool_name;
    std::string tool_call_id;
@@ -44,7 +44,7 @@ struct common_chat_msg {
    bool empty() const {
        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
    }
-    void set_tool_call_ids(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
            if (ids_cache.size() <= i) {
                auto id = tool_calls[i].id;
@@ -101,7 +101,6 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_CONTENT_ONLY,
    COMMON_CHAT_FORMAT_GENERIC,
    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
-    COMMON_CHAT_FORMAT_MAGISTRAL,
    COMMON_CHAT_FORMAT_LLAMA_3_X,
    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
@@ -115,7 +114,6 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_GPT_OSS,
    COMMON_CHAT_FORMAT_SEED_OSS,
    COMMON_CHAT_FORMAT_NEMOTRON_V2,
-    COMMON_CHAT_FORMAT_APERTUS,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -14,7 +14,6 @@
 #include <climits>
 #include <cmath>
 #include <codecvt>
-#include <chrono>
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
@@ -51,11 +50,6 @@
 #include <unistd.h>
 #endif

-#if defined(__linux__)
-#include <sys/types.h>
-#include <pwd.h>
-#endif
-
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -870,20 +864,8 @@ std::string fs_get_cache_directory() {
 #if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
        if (std::getenv("XDG_CACHE_HOME")) {
            cache_directory = std::getenv("XDG_CACHE_HOME");
-        } else if (std::getenv("HOME")) {
-            cache_directory = std::getenv("HOME") + std::string("/.cache/");
        } else {
-#if defined(__linux__)
-            /* no $HOME is defined, fallback to getpwuid */
-            struct passwd *pw = getpwuid(getuid());
-            if ((!pw) || (!pw->pw_dir)) {
-                throw std::runtime_error("Failed to find $HOME directory");
-            }
-
-            cache_directory = std::string(pw->pw_dir) + std::string("/.cache/");
-#else /* defined(__linux__) */
-            throw std::runtime_error("Failed to find $HOME directory");
-#endif /* defined(__linux__) */
+            cache_directory = std::getenv("HOME") + std::string("/.cache/");
        }
 #elif defined(__APPLE__)
        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
@@ -978,13 +960,15 @@ struct common_init_result common_init_from_params(common_params & params) {

        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
-        bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;

-        if (!has_eos && !has_sep && !has_rerank_prompt) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
+        if (!has_eos && !has_sep) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
            ok = false;
        } else if (!has_eos) {
            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+        } else if (!has_sep) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+            ok = false;
        }

        if (!ok) {
@@ -1133,7 +1117,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
    mparams.use_mlock       = params.use_mlock;
    mparams.check_tensors   = params.check_tensors;
    mparams.use_extra_bufts = !params.no_extra_bufts;
-    mparams.no_host         = params.no_host;

    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
--- a/common/common.h
+++ b/common/common.h
@@ -193,11 +193,10 @@ struct common_params_sampling {
 };

 struct common_params_model {
-    std::string path        = ""; // model local path                                       // NOLINT
-    std::string url         = ""; // model url to download                                  // NOLINT
-    std::string hf_repo     = ""; // HF repo                                                // NOLINT
-    std::string hf_file     = ""; // HF file                                                // NOLINT
-    std::string docker_repo = ""; // Docker repo                                            // NOLINT
+    std::string path    = ""; // model local path                                           // NOLINT
+    std::string url     = ""; // model url to download                                      // NOLINT
+    std::string hf_repo = ""; // HF repo                                                    // NOLINT
+    std::string hf_file = ""; // HF file                                                    // NOLINT
 };

 struct common_params_speculative {
@@ -288,9 +287,9 @@ struct common_params {
    float   rope_freq_base        =  0.0f; // RoPE base frequency
    float   rope_freq_scale       =  0.0f; // RoPE frequency scaling factor
    float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor
-    float   yarn_attn_factor      = -1.0f; // YaRN magnitude scaling factor
-    float   yarn_beta_fast        = -1.0f; // YaRN low correction dim
-    float   yarn_beta_slow        = -1.0f; // YaRN high correction dim
+    float   yarn_attn_factor      =  1.0f; // YaRN magnitude scaling factor
+    float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
+    float   yarn_beta_slow        =  1.0f; // YaRN high correction dim
    int32_t yarn_orig_ctx         =     0; // YaRN original context length

    // offload params
@@ -378,7 +377,7 @@ struct common_params {
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
    bool no_perf           = false; // disable performance metrics
-    bool ctx_shift         = false; // context shift on infinite text generation
+    bool ctx_shift         = false;  // context shift on infinite text generation
    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified        = false; // enable unified KV cache

@@ -392,7 +391,6 @@ struct common_params {
    bool check_tensors     = false; // validate tensor data
    bool no_op_offload     = false; // globally disable offload host tensor operations to device
    bool no_extra_bufts    = false; // disable extra buffer types (used for weight repacking)
-    bool no_host           = false; // bypass host buffer allowing extra buffers to be used

    bool single_turn       = false; // single turn chat conversation

@@ -425,8 +423,7 @@ struct common_params {
    int32_t timeout_write     = timeout_read; // http write timeout in seconds
    int32_t n_threads_http    = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse     = 0;            // min chunk size to reuse from the cache via KV shifting
-    int32_t n_ctx_checkpoints = 8;            // max number of context checkpoints per slot
-    int32_t cache_ram_mib     = 8192;         // 0 = no limit, 1 = 1 MiB, etc.
+    int32_t n_swa_checkpoints = 3;            // max number of SWA checkpoints per slot

    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";                                                                         // NOLINT
@@ -434,7 +431,7 @@ struct common_params {
    std::string chat_template = "";                                                                         // NOLINT
    bool use_jinja = false;                                                                                 // NOLINT
    bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
    int reasoning_budget = -1;
    bool prefill_assistant = true;                                                                          // if true, any trailing assistant message will be prefilled into the response

@@ -455,7 +452,7 @@ struct common_params {

    std::string slot_save_path;

-    float slot_prompt_similarity = 0.1f;
+    float slot_prompt_similarity = 0.5f;

    // batched-bench params
    bool is_pp_shared = false;
@@ -736,20 +733,6 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

 }

-//
-// MoE utils
-//
-
-const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_(ch|)exps";
-
-static std::string llm_ffn_exps_block_regex(int idx) {
-    return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
-}
-
-static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
-    return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
-}
-
 //
 // training utils
 //
--- a/common/http.h
+++ b/common/http.h
@@ -1,73 +0,0 @@
-#pragma once
-
-#include <cpp-httplib/httplib.h>
-
-struct common_http_url {
-    std::string scheme;
-    std::string user;
-    std::string password;
-    std::string host;
-    std::string path;
-};
-
-static common_http_url common_http_parse_url(const std::string & url) {
-    common_http_url parts;
-    auto scheme_end = url.find("://");
-
-    if (scheme_end == std::string::npos) {
-        throw std::runtime_error("invalid URL: no scheme");
-    }
-    parts.scheme = url.substr(0, scheme_end);
-
-    if (parts.scheme != "http" && parts.scheme != "https") {
-        throw std::runtime_error("unsupported URL scheme: " + parts.scheme);
-    }
-
-    auto rest = url.substr(scheme_end + 3);
-    auto at_pos = rest.find('@');
-
-    if (at_pos != std::string::npos) {
-        auto auth = rest.substr(0, at_pos);
-        auto colon_pos = auth.find(':');
-        if (colon_pos != std::string::npos) {
-            parts.user = auth.substr(0, colon_pos);
-            parts.password = auth.substr(colon_pos + 1);
-        } else {
-            parts.user = auth;
-        }
-        rest = rest.substr(at_pos + 1);
-    }
-
-    auto slash_pos = rest.find('/');
-
-    if (slash_pos != std::string::npos) {
-        parts.host = rest.substr(0, slash_pos);
-        parts.path = rest.substr(slash_pos);
-    } else {
-        parts.host = rest;
-        parts.path = "/";
-    }
-    return parts;
-}
-
-static std::pair<httplib::Client, common_http_url> common_http_client(const std::string & url) {
-    common_http_url parts = common_http_parse_url(url);
-
-    if (parts.host.empty()) {
-        throw std::runtime_error("error: invalid URL format");
-    }
-
-    httplib::Client cli(parts.scheme + "://" + parts.host);
-
-    if (!parts.user.empty()) {
-        cli.set_basic_auth(parts.user, parts.password);
-    }
-
-    cli.set_follow_location(true);
-
-    return { std::move(cli), std::move(parts) };
-}
-
-static std::string common_http_show_masked_url(const common_http_url & parts) {
-    return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path;
-}
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -257,13 +257,12 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
 };

 static bool is_reserved_name(const std::string & name) {
-    static const std::unordered_set<std::string> RESERVED_NAMES = [] {
-        std::unordered_set<std::string> s;
-        s.insert("root");
-        for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
-        for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
-        return s;
-    }();
+    static std::unordered_set<std::string> RESERVED_NAMES;
+    if (RESERVED_NAMES.empty()) {
+        RESERVED_NAMES.insert("root");
+        for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
+        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
+    }
    return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }

--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -332,7 +332,6 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
    }
    if (ctx) {
        llama_perf_context_print(ctx);
-        llama_memory_breakdown_print(ctx);
    }
 }

--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -93,15 +93,13 @@ class ModelBase:
    # Mistral format specifics
    is_mistral_format: bool = False
    disable_mistral_community_chat_template: bool = False
-    sentence_transformers_dense_modules: bool = False

    def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
                 use_temp_file: bool = False, eager: bool = False,
                 metadata_override: Path | None = None, model_name: str | None = None,
                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
-                 disable_mistral_community_chat_template: bool = False,
-                 sentence_transformers_dense_modules: bool = False):
+                 disable_mistral_community_chat_template: bool = False):
        if type(self) is ModelBase or \
                type(self) is TextModel or \
                type(self) is MmprojModel:
@@ -116,7 +114,6 @@ class ModelBase:
        self.lazy = not eager or (remote_hf_model_id is not None)
        self.dry_run = dry_run
        self.remote_hf_model_id = remote_hf_model_id
-        self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
        if remote_hf_model_id is not None:
            self.is_safetensors = True

@@ -738,9 +735,6 @@ class TextModel(ModelBase):
        if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
            # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
            res = "qwen2"
-        if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
-            # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
-            res = "grok-2"
        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
            res = "llama-bpe"
@@ -891,12 +885,6 @@ class TextModel(ModelBase):
        if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
            # ref: https://huggingface.co/JetBrains/Mellum-4b-base
            res = "mellum"
-        if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
-            # ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
-            res = "llada-moe"
-        if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
-            # ref: https://huggingface.co/ibm-granite/granite-docling-258M
-            res = "granite-docling"

        if res is None:
            logger.warning("\n")
@@ -1331,7 +1319,6 @@ class MmprojModel(ModelBase):
        self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)

        # load preprocessor config
-        self.preprocessor_config = {}
        if not self.is_mistral_format:
            with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
                self.preprocessor_config = json.load(f)
@@ -1354,8 +1341,7 @@ class MmprojModel(ModelBase):
            self.gguf_writer.add_vision_projection_dim(self.n_embd_text)

            # vision config
-            self.image_size = self.find_vparam(["image_size"])
-            self.gguf_writer.add_vision_image_size(self.image_size)
+            self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
            self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
            self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
            self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
@@ -2386,10 +2372,6 @@ class SmolVLMModel(MmprojModel):
        self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
        self.gguf_writer.add_vision_use_gelu(True)

-        # Add the preprocessor longest edge size
-        preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size)
-        self.gguf_writer.add_vision_preproc_image_size(preproc_image_size)
-
    def tensor_force_quant(self, name, new_name, bid, n_dims):
        if ".embeddings." in name:
            return gguf.GGMLQuantizationType.F32
@@ -2405,10 +2387,7 @@ class SmolVLMModel(MmprojModel):
        return [] # skip other tensors


-@ModelBase.register(
-    "Llama4ForConditionalGeneration",
-    "Llama4ForCausalLM",
-)
+@ModelBase.register("Llama4ForConditionalGeneration")
 class Llama4Model(LlamaModel):
    model_arch = gguf.MODEL_ARCH.LLAMA4
    undo_permute = False
@@ -2426,10 +2405,6 @@ class Llama4Model(LlamaModel):
        super().set_gguf_parameters()
        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
-        if "layer_types" in self.hparams:
-            if all(lt == "full_attention" for lt in self.hparams["layer_types"]):
-                # all layers are full attention (for MobileLLM), disable swa
-                self.gguf_writer.add_sliding_window(0)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
        if name.startswith("language_model."):
@@ -2707,20 +2682,12 @@ class BitnetModel(TextModel):
        yield (new_name, data_torch)


-@ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM")
+@ModelBase.register("GrokForCausalLM")
 class GrokModel(TextModel):
    model_arch = gguf.MODEL_ARCH.GROK

    def set_vocab(self):
-        if (self.dir_model / 'tokenizer.model').is_file():
-            self._set_vocab_sentencepiece()
-            return
-
-        if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file():
-            logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer')
-            sys.exit(1)
-
-        self._set_vocab_gpt2()
+        self._set_vocab_sentencepiece()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
@@ -2728,46 +2695,11 @@ class GrokModel(TextModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()

-        self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0))
-        self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0))
-        if (final_logit_softcap := self.hparams.get("final_logit_softcapping")):
-            self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
-
-        if (rope_dim := self.hparams.get("head_dim")) is None:
-            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-
-        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
-            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
-
-        # Treat "original" as "yarn", seems to have been a mistake
-        if self.hparams.get("rope_type") in ("yarn", "original"):
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"])
-            self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"])
-            self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"])
-
-        if temp_len := self.hparams.get("attn_temperature_len"):
-            self.gguf_writer.add_attn_temperature_length(temp_len)
-
-        self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5))
-        self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"])
-        self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"])
-
-    _experts: list[dict[str, list[Tensor]]] | None = None
-    _cur_expert = ""
+    _experts: list[dict[str, Tensor]] | None = None

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        tensors: list[tuple[str, Tensor]] = []
-        is_expert = ".moe." in name or ".block_sparse_moe.experts." in name
-
-        if not is_expert:
-            tensors.append((self.map_tensor_name(name), data_torch))
-
        # process the experts separately
-        if is_expert or self._cur_expert:
+        if name.find(".moe.") != -1:
            n_experts = self.hparams["num_local_experts"]

            assert bid is not None
@@ -2775,41 +2707,32 @@ class GrokModel(TextModel):
            if self._experts is None:
                self._experts = [{} for _ in range(self.block_count)]

-            # concatenate split tensors
-            if name in self._experts[bid]:
-                self._cur_expert = name
-                self._experts[bid][name].append(data_torch)
-                return []
-            elif is_expert:
-                self._cur_expert = name
-                self._experts[bid][name] = [data_torch]
-                return []
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for wid in ["linear", "linear_1", "linear_v"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
            else:
-                self._cur_expert = ""
+                return []

-            for bid in range(self.block_count):
-                if len(self._experts[bid]) >= n_experts * 3:
-                    # merge the experts into a single 3d tensor
-                    for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]:
-                        datas: list[Tensor] = []
-
-                        for xid in range(n_experts):
-                            ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight"
-                            if ename not in self._experts[bid]:
-                                ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight"
-                            tensor_list = self._experts[bid][ename]
-                            datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0])
-                            del self._experts[bid][ename]
-
-                        data_torch = torch.stack(datas, dim=0)
-
-                        merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight"
-
-                        new_name = self.map_tensor_name(merged_name)
-
-                        yield (new_name, data_torch)
-
-        yield from tensors
+        return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("DbrxForCausalLM")
@@ -3729,29 +3652,11 @@ class Qwen2MoeModel(TextModel):
 class Qwen3Model(Qwen2Model):
    model_arch = gguf.MODEL_ARCH.QWEN3

-    # extra logic for rerank models
-    is_rerank: bool = False
-    is_tied_embeddings: bool = False
-    token_false_id: int | None = None
-    token_true_id: int | None = None
-
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-
-        # track for intern-s1-mini
        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
        self.origin_hf_arch = hparams.get('architectures', [None])[0]

-        # a bit hacky, but currently the only way to detect if this is a rerank model
-        # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
-        readme_path = self.dir_model / "README.md"
-        readme_text = ""
-        if readme_path.exists():
-            with readme_path.open("r", encoding="utf-8") as f:
-                readme_text = f.read()
-        if "# Qwen3-Reranker" in readme_text:
-            self._find_rerank_config()
-
    def set_vocab(self):
        # deal with intern-s1-mini
        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
@@ -3760,53 +3665,6 @@ class Qwen3Model(Qwen2Model):

        super().set_vocab()

-    def _find_rerank_config(self):
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-
-        self.is_rerank = True
-        self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
-        self.token_false_id = tokenizer.convert_tokens_to_ids("no")
-        self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
-        self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
-
-        assert self.token_false_id is not None and self.token_true_id is not None
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        if self.is_rerank:
-            self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
-            self.gguf_writer.add_classifier_output_labels(["yes", "no"])
-            self.gguf_writer.add_chat_template([{
-                "name": "rerank",
-                "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
-                            "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n"
-                            "<|im_start|>assistant\n<think>\n\n</think>\n\n"
-            }])
-
-    def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
-        # extract "yes" and "no" tokens from the output lm_head tensor
-        false_row = data_torch[self.token_false_id]
-        true_row = data_torch[self.token_true_id]
-        return torch.stack([true_row, false_row], dim=0)
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if self.is_rerank:
-            is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
-            is_real_head = not self.is_tied_embeddings and "lm_head" in name
-            if is_tied_head or is_real_head:
-                cls_out_head = (
-                    gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight",
-                    self._get_cls_out_tensor(data_torch),
-                )
-                if is_tied_head:
-                    embed = (self.map_tensor_name(name), data_torch)
-                    return [cls_out_head, embed]
-                if is_real_head:
-                    return [cls_out_head]
-
-        return super().modify_tensors(data_torch, name, bid)
-

@ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
@@ -4262,8 +4120,7 @@ class Plamo2Model(TextModel):
        # This logic matches modeling_plamo.py's is_mamba function
        mamba_step = hparams.get("mamba_step", 2)
        mamba_enabled = hparams.get("mamba_enabled", True)
-        num_key_value_heads = []
-        num_attention_heads = []
+        mamba_layers = []

        if mamba_enabled:
            for i in range(block_count):
@@ -4273,21 +4130,17 @@ class Plamo2Model(TextModel):
                else:
                    is_mamba = (i % mamba_step) != (mamba_step // 2)
                if is_mamba:
-                    num_key_value_heads.append(0)
-                    num_attention_heads.append(0)
+                    mamba_layers.append(0)
                else:
-                    num_key_value_heads.append(hparams.get("num_key_value_heads", 4))
-                    num_attention_heads.append(hparams.get("num_attention_heads", 32))
+                    mamba_layers.append(hparams.get("num_key_value_heads", 4))

-        if num_key_value_heads and num_attention_heads:
-            self.gguf_writer.add_head_count_kv(num_key_value_heads)
-            self.gguf_writer.add_head_count(num_attention_heads)
+        if mamba_layers:
+            self.gguf_writer.add_head_count_kv(mamba_layers)

        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048))
        self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
-        self.gguf_writer.add_key_length(hparams.get("hidden_size_per_head", 128))
-        self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128))
        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
        self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))

@@ -5272,53 +5125,6 @@ class Gemma3Model(TextModel):
@ModelBase.register("Gemma3TextModel")
 class EmbeddingGemma(Gemma3Model):
    model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING
-    module_paths = []
-    dense_features_dims = {}
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        if self.sentence_transformers_dense_modules:
-            # read modules.json to determine if model has Dense layers
-            modules_file = self.dir_model / "modules.json"
-            if modules_file.is_file():
-                with open(modules_file, encoding="utf-8") as modules_json_file:
-                    mods = json.load(modules_json_file)
-                for mod in mods:
-                    if mod["type"] == "sentence_transformers.models.Dense":
-                        mod_path = mod["path"]
-                        # check if model.safetensors file for Dense layer exists
-                        model_tensors_file = self.dir_model / mod_path / "model.safetensors"
-                        if model_tensors_file.is_file():
-                            self.module_paths.append(mod_path)
-                            # read config.json of the Dense layer to get in/out features
-                            mod_conf_file = self.dir_model / mod_path / "config.json"
-                            if mod_conf_file.is_file():
-                                with open(mod_conf_file, encoding="utf-8") as mod_conf_json_file:
-                                    mod_conf = json.load(mod_conf_json_file)
-                                    # hparams dense_2_feat_out and dense_3_feat_in are required when loading model's dense weights
-                                    prefix = self._get_dense_prefix(mod_path)
-                                    if mod_conf["in_features"] is not None and mod_conf["out_features"] is not None:
-                                        self.dense_features_dims[prefix] = (mod_conf["in_features"], mod_conf["out_features"])
-
-    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        from safetensors.torch import load_file
-        module_paths = list(self.module_paths)
-        for i, module_path in enumerate(module_paths):
-            tensors_file = self.dir_model / module_path / "model.safetensors"
-            local_tensors = load_file(tensors_file)
-            tensor_name = self._get_dense_prefix(module_path)
-            for name, local_tensor in local_tensors.items():
-                if not name.endswith(".weight"):
-                    continue
-                orig_name = name.replace("linear", tensor_name)
-                name = self.map_tensor_name(orig_name)
-                yield name, local_tensor.clone()
-
-    @staticmethod
-    def _get_dense_prefix(module_path) -> str:
-        """Get the tensor name prefix for the Dense layer from module path."""
-        tensor_name = "dense_2" if module_path == "2_Dense" else "dense_3"
-        return tensor_name

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
@@ -5335,10 +5141,6 @@ class EmbeddingGemma(Gemma3Model):
            logger.info(f"Using original sliding_window from config: {orig_sliding_window} "
                        f"instead of {self.hparams['sliding_window']}")
            self.gguf_writer.add_sliding_window(orig_sliding_window)
-        if self.sentence_transformers_dense_modules:
-            for dense, dims in self.dense_features_dims.items():
-                logger.info(f"Setting dense layer {dense} in/out features to {dims}")
-                self.gguf_writer.add_dense_features_dims(dense, dims[0], dims[1])

        self._try_set_pooling_type()

@@ -6149,34 +5951,9 @@ class SeedOssModel(TextModel):


@ModelBase.register("Olmo2ForCausalLM")
-@ModelBase.register("Olmo3ForCausalLM")
 class Olmo2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.OLMO2

-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-
-        if "sliding_window" in self.hparams:
-            self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
-
-            sliding_window_pattern = []
-            if "layer_types" in self.hparams:
-                sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]]
-            else:
-                # Olmo2 does not use sliding window attention.
-                # Olmo3 defaults to using sliding window for all layers except every 4th.
-                for i in range(self.hparams["num_hidden_layers"]):
-                    sliding_window_pattern.append((i + 1) % 4 != 0)
-
-            self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
-

@ModelBase.register("OlmoeForCausalLM")
 class OlmoeModel(TextModel):
@@ -6924,8 +6701,6 @@ class T5Model(TextModel):
        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
        self.gguf_writer.add_block_count(self.hparams["num_layers"])
-        if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None:
-            self.gguf_writer.add_decoder_block_count(dec_n_layer)
        self.gguf_writer.add_head_count(self.hparams["num_heads"])
        self.gguf_writer.add_key_length(self.hparams["d_kv"])
        self.gguf_writer.add_value_length(self.hparams["d_kv"])
@@ -7789,21 +7564,6 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
            if i not in self._attn_layers
        ]

-        # There are some models in this family that are non-hybrid, but keep the
-        # same parent class by setting all layers to "attention." If this is the
-        # case, the model architecture needs to be updated to a standard
-        # "granite" or "granitemoe" model
-        if not self._ssm_layers:
-            has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
-            new_arch = (
-                gguf.MODEL_ARCH.GRANITE_MOE
-                if has_experts else
-                gguf.MODEL_ARCH.GRANITE
-            )
-            self.model_arch = new_arch
-            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch]
-            self.gguf_writer.add_architecture()
-
        # n_group and d_inner are used during reshape_tensors for mamba2
        # NOTE: Explicitly include hparam prefix prefix for d_model to
        #   disambiguate with top-level head_dim
@@ -7888,11 +7648,8 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
            self.gguf_writer.add_rope_dimension_count(rope_dim)
        self.gguf_writer.add_head_count_kv(head_count_kv_vec)

-        ## If Bamba or non-hybrid, use rope, otherwise don't
-        use_rope = (
-            "BambaForCausalLM" in self.hparams["architectures"]
-            or not self._ssm_layers
-        )
+        ## If Bamba, use rope, otherwise don't
+        use_rope = "BambaForCausalLM" in self.hparams["architectures"]
        self.gguf_writer.add_rope_scaling_finetuned(use_rope)
        if not use_rope:
            self.gguf_writer.add_context_length(2**20)
@@ -8063,121 +7820,6 @@ class BailingMoeModel(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")


-@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
-class GroveMoeModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.GROVEMOE
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        if (n_experts := self.hparams.get("num_experts")) is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
-            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
-            logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
-        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299
-        self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128)
-        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298
-        self.gguf_writer.add_experts_per_group(2)
-        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
-        self.gguf_writer.add_expert_group_scale(0.05)
-        # YaRN is not enabled by default
-        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-
-    _experts: list[dict[str, Tensor]] | None = None
-    _chunk_experts: list[dict[str, Tensor]] | None = None
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        if name.endswith(".expert_bias"):
-            # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303
-            return []
-
-        # process the experts separately
-        if name.find("chunk_experts") != -1:
-            n_experts = self.hparams["num_experts"] // 2 # see add_experts_per_group
-            assert bid is not None
-
-            if self._chunk_experts is None:
-                self._chunk_experts = [{} for _ in range(self.block_count)]
-
-            self._chunk_experts[bid][name] = data_torch
-
-            if len(self._chunk_experts[bid]) >= n_experts * 3:
-                tensors: list[tuple[str, Tensor]] = []
-
-                # merge the experts into a single 3d tensor
-                for w_name in ["down_proj", "gate_proj", "up_proj"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight"
-                        datas.append(self._chunk_experts[bid][ename])
-                        del self._chunk_experts[bid][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-
-                    merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight"
-
-                    new_name = self.map_tensor_name(merged_name)
-
-                    tensors.append((new_name, data_torch))
-                return tensors
-            else:
-                return []
-        elif name.find("experts") != -1:
-            n_experts = self.hparams["num_experts"]
-            assert bid is not None
-
-            if self._experts is None:
-                self._experts = [{} for _ in range(self.block_count)]
-
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                tensors: list[tuple[str, Tensor]] = []
-
-                # merge the experts into a single 3d tensor
-                for w_name in ["down_proj", "gate_proj", "up_proj"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-
-                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-
-                    new_name = self.map_tensor_name(merged_name)
-
-                    tensors.append((new_name, data_torch))
-                return tensors
-            else:
-                return []
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def prepare_tensors(self):
-        super().prepare_tensors()
-
-        if self._chunk_experts is not None:
-            # flatten `list[dict[str, Tensor]]` into `list[str]`
-            chunk_experts = [k for d in self._chunk_experts for k in d.keys()]
-            if len(chunk_experts) > 0:
-                raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}")
-
-        if self._experts is not None:
-            # flatten `list[dict[str, Tensor]]` into `list[str]`
-            experts = [k for d in self._experts for k in d.keys()]
-            if len(experts) > 0:
-                raise ValueError(f"Unprocessed experts: {experts}")
-
-
@ModelBase.register("ChameleonForConditionalGeneration")
@ModelBase.register("ChameleonForCausalLM")  # obsolete
 class ChameleonModel(TextModel):
@@ -8540,76 +8182,6 @@ class HunYuanMoEModel(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")


-@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM")
-class LLaDAMoEModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.LLADA_MOE
-
-    def set_gguf_parameters(self):
-        super().set_gguf_parameters()
-        if (n_experts := self.hparams.get("num_experts")) is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-
-        if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None:
-            self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
-
-        # number of experts used per token (top-k)
-        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
-            self.gguf_writer.add_expert_used_count(n_experts_used)
-
-        self.gguf_writer.add_mask_token_id(156895)
-        self.gguf_writer.add_causal_attention(False)
-        self.gguf_writer.add_diffusion_shift_logits(False)
-
-    _experts: list[dict[str, Tensor]] | None = None
-
-    # Copied from: Qwen2MoeModel
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # process the experts separately
-        if name.find("experts") != -1:
-            n_experts = self.hparams["num_experts"]
-            assert bid is not None
-
-            if self._experts is None:
-                self._experts = [{} for _ in range(self.block_count)]
-
-            self._experts[bid][name] = data_torch
-
-            if len(self._experts[bid]) >= n_experts * 3:
-                tensors: list[tuple[str, Tensor]] = []
-
-                # merge the experts into a single 3d tensor
-                for w_name in ["down_proj", "gate_proj", "up_proj"]:
-                    datas: list[Tensor] = []
-
-                    for xid in range(n_experts):
-                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
-                        datas.append(self._experts[bid][ename])
-                        del self._experts[bid][ename]
-
-                    data_torch = torch.stack(datas, dim=0)
-
-                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-
-                    new_name = self.map_tensor_name(merged_name)
-
-                    tensors.append((new_name, data_torch))
-                return tensors
-            else:
-                return []
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    # Copied from: Qwen2MoeModel
-    def prepare_tensors(self):
-        super().prepare_tensors()
-
-        if self._experts is not None:
-            # flatten `list[dict[str, Tensor]]` into `list[str]`
-            experts = [k for d in self._experts for k in d.keys()]
-            if len(experts) > 0:
-                raise ValueError(f"Unprocessed experts: {experts}")
-
-
@ModelBase.register("HunYuanDenseV1ForCausalLM")
 class HunYuanModel(TextModel):
    model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
@@ -8890,75 +8462,6 @@ class LFM2Model(TextModel):
        return [(self.map_tensor_name(name), data_torch)]


-@ModelBase.register("Lfm2MoeForCausalLM")
-class LFM2MoeModel(TextModel):
-    model_arch = gguf.MODEL_ARCH.LFM2MOE
-
-    def set_gguf_parameters(self):
-        # set num_key_value_heads only for attention layers
-        self.hparams["num_key_value_heads"] = [
-            self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0
-            for layer_type in self.hparams["layer_types"]
-        ]
-
-        super().set_gguf_parameters()
-
-        self.gguf_writer.add_expert_count(self.hparams["num_experts"])
-        self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
-        self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"])
-        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-
-        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
-        self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"])
-
-    # cache for experts weights for merging
-    _experts_cache: dict[int, dict[str, Tensor]] = {}
-
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # conv op requires 2d tensor
-        if 'conv.conv' in name:
-            data_torch = data_torch.squeeze(1)
-
-        if name.endswith(".expert_bias"):
-            name = name.replace(".expert_bias", ".expert_bias.bias")
-
-        # merge expert weights
-        if 'experts' in name:
-            n_experts = self.hparams["num_experts"]
-            assert bid is not None
-
-            expert_cache = self._experts_cache.setdefault(bid, {})
-            expert_cache[name] = data_torch
-            expert_weights = ["w1", "w2", "w3"]
-
-            # not enough expert weights to merge
-            if len(expert_cache) < n_experts * len(expert_weights):
-                return []
-
-            tensors: list[tuple[str, Tensor]] = []
-            for w_name in expert_weights:
-                datas: list[Tensor] = []
-
-                for xid in range(n_experts):
-                    ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{w_name}.weight"
-                    datas.append(expert_cache[ename])
-                    del expert_cache[ename]
-
-                data_torch = torch.stack(datas, dim=0)
-                merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight"
-                new_name = self.map_tensor_name(merged_name)
-                tensors.append((new_name, data_torch))
-
-            del self._experts_cache[bid]
-            return tensors
-
-        return [(self.map_tensor_name(name), data_torch)]
-
-    def prepare_tensors(self):
-        super().prepare_tensors()
-        assert not self._experts_cache
-
-
@ModelBase.register("Lfm2VlForConditionalGeneration")
 class LFM2VLModel(MmprojModel):
    def __init__(self, *args, **kwargs):
@@ -9077,43 +8580,6 @@ class SmallThinkerModel(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")


-@ModelBase.register("ApertusForCausalLM")
-class ApertusModel(LlamaModel):
-    model_arch = gguf.MODEL_ARCH.APERTUS
-    undo_permute = False
-
-    _alpha_n = {}
-    _alpha_p = {}
-    _beta = {}
-    _eps = {}
-
-    def modify_tensors(self, data_torch, name, bid):
-        # Handle xIELU activation parameters
-        n_layers = self.hparams["num_hidden_layers"]
-        if name.endswith(".act_fn.alpha_n"):
-            self._alpha_n[bid] = data_torch.to("cpu").float().item()
-            if (len(self._alpha_n) == n_layers):
-                self.gguf_writer.add_xielu_alpha_n([self._alpha_n[k] for k in sorted(self._alpha_n)])
-            return []
-        if name.endswith(".act_fn.alpha_p"):
-            self._alpha_p[bid] = data_torch.to("cpu").float().item()
-            if (len(self._alpha_p) == n_layers):
-                self.gguf_writer.add_xielu_alpha_p([self._alpha_p[k] for k in sorted(self._alpha_p)])
-            return []
-        if name.endswith(".act_fn.beta"):
-            self._beta[bid] = data_torch.to("cpu").float().item()
-            if (len(self._beta) == n_layers):
-                self.gguf_writer.add_xielu_beta([self._beta[k] for k in sorted(self._beta)])
-            return []
-        if name.endswith(".act_fn.eps"):
-            self._eps[bid] = data_torch.to("cpu").float().item()
-            if (len(self._eps) == n_layers):
-                self.gguf_writer.add_xielu_eps([self._eps[k] for k in sorted(self._eps)])
-            return []
-
-        return super().modify_tensors(data_torch, name, bid)
-
-
 class MistralModel(LlamaModel):
    model_arch = gguf.MODEL_ARCH.LLAMA
    model_name = "Mistral"
@@ -9281,7 +8747,7 @@ class LazyTorchTensor(gguf.LazyBase):
    def from_safetensors_slice(cls, st_slice: Any) -> Tensor:
        dtype = cls._dtype_str_map[st_slice.get_dtype()]
        shape: tuple[int, ...] = tuple(st_slice.get_shape())
-        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[...] if len(s.get_shape()) == 0 else s[:])
+        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
        return cast(torch.Tensor, lazy)

    @classmethod
@@ -9389,13 +8855,6 @@ def parse_args() -> argparse.Namespace:
        )
    )

-    parser.add_argument(
-        "--sentence-transformers-dense-modules", action="store_true",
-        help=("Whether to include sentence-transformers dense modules."
-              "It can be used for sentence-transformers models, like google/embeddinggemma-300m"
-              "Default these modules are not included.")
-    )
-
    args = parser.parse_args()
    if not args.print_supported_models and args.model is None:
        parser.error("the following arguments are required: model")
@@ -9458,13 +8917,9 @@ def main() -> None:
    if args.remote:
        hf_repo_id = args.model
        from huggingface_hub import snapshot_download
-        allowed_patterns = ["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"]
-        if args.sentence_transformers_dense_modules:
-            # include sentence-transformers dense modules safetensors files
-            allowed_patterns.append("*.safetensors")
        local_dir = snapshot_download(
            repo_id=hf_repo_id,
-            allow_patterns=allowed_patterns)
+            allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
        dir_model = Path(local_dir)
        logger.info(f"Downloaded config and tokenizer to {local_dir}")
    else:
@@ -9532,8 +8987,7 @@ def main() -> None:
                                     split_max_tensors=args.split_max_tensors,
                                     split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
                                     small_first_shard=args.no_tensor_first_split,
-                                     remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
-                                     sentence_transformers_dense_modules=args.sentence_transformers_dense_modules
+                                     remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template
                                     )

        if args.vocab_only:
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -139,8 +139,6 @@ models = [
    {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
    {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
    {"name": "mellum",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
-    {"name": "llada-moe",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
-    {"name": "granite-docling",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
@@ -160,7 +158,6 @@ pre_computed_hashes = [
    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
    {"name": "kimi-k2",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base",   "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
    {"name": "qwen2",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
-    {"name": "grok-2",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
 ]


--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -314,11 +314,3 @@ Converting the matmul weight format from ND to NZ to improve performance. Enable
 ### GGML_CANN_ACL_GRAPH

 Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default.
-
-### GGML_CANN_GRAPH_CACHE_CAPACITY
-
-Maximum number of compiled CANN graphs kept in the LRU cache, default is 12. When the number of cached graphs exceeds this capacity, the least recently used graph will be evicted.
-
-### GGML_CANN_PREFILL_USE_GRAPH
-
-Enable ACL graph execution during the prefill stage, default is false. This option is only effective when FA is enabled.
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -145,13 +145,12 @@ The docker build option is currently limited to *Intel GPU* targets.
 ```sh
 # Using FP16
 docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
-
-# Using FP32
-docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=OFF" --target light -f .devops/intel.Dockerfile .
 ```

 *Notes*:

+To build in default FP32 *(Slower than FP16 alternative)*, set `--build-arg="GGML_SYCL_F16=OFF"` in the previous command.
+
 You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
 Check the [documentation for Docker](../docker.md) to see the available images.

@@ -161,7 +160,7 @@ Check the [documentation for Docker](../docker.md) to see the available images.
 # First, find all the DRI cards
 ls -la /dev/dri
 # Then, pick the card that you want to use (here for e.g. /dev/dri/card1).
-docker run -it --rm -v "/path/to/models:/models" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card0:/dev/dri/card0 llama-cpp-sycl -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -c 4096 -s 0
+docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
 ```

 *Notes:*
@@ -216,19 +215,9 @@ To target AMD GPUs with SYCL, the ROCm stack must be installed first.

 2. **Install Intel® oneAPI Base toolkit**

-SYCL backend depends on:
-  - Intel® oneAPI DPC++/C++ compiler/running-time.
-  - Intel® oneAPI DPC++/C++ library (oneDPL).
-  - Intel® oneAPI Deep Neural Network Library (oneDNN).
-  - Intel® oneAPI Math Kernel Library (oneMKL).
-
 - **For Intel GPU**

-All above are included in both **Intel® oneAPI Base toolkit** and **Intel® Deep Learning Essentials** packages.
-
-It's recommended to install **Intel® Deep Learning Essentials** which only provides the necessary libraries with less size.
-
-The **Intel® oneAPI Base toolkit** and **Intel® Deep Learning Essentials** can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
+The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.

 Please follow the instructions for downloading and installing the Toolkit for Linux, and preferably keep the default installation values unchanged, notably the installation path *(`/opt/intel/oneapi` by default)*.

@@ -236,12 +225,6 @@ Following guidelines/code snippets assume the default installation values. Other

 Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI oneDNN for Intel GPUs.

-|Verified release|
-|-|
-|2025.2.1|
-|2025.1|
-|2024.1|
-
 - **Adding support to Nvidia GPUs**

 **oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
@@ -272,11 +255,10 @@ sycl-ls
 When targeting an intel GPU, the user should expect one or more devices among the available SYCL devices. Please make sure that at least one GPU is present via `sycl-ls`, for instance `[level_zero:gpu]` in the sample output below:

 ```
-[level_zero:gpu][level_zero:0] Intel(R) oneAPI Unified Runtime over Level-Zero, Intel(R) Arc(TM) A770 Graphics 12.55.8 [1.3.29735+27]
-[level_zero:gpu][level_zero:1] Intel(R) oneAPI Unified Runtime over Level-Zero, Intel(R) UHD Graphics 730 12.2.0 [1.3.29735+27]
-[opencl:cpu][opencl:0] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i5-13400 OpenCL 3.0 (Build 0) [2025.20.8.0.06_160000]
-[opencl:gpu][opencl:1] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [24.39.31294]
-[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) UHD Graphics 730 OpenCL 3.0 NEO  [24.39.31294]
+[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
+[opencl:cpu][opencl:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
+[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
+[level_zero:gpu][level_zero:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
 ```

 - **Nvidia GPU**
@@ -371,7 +353,7 @@ cmake --build build --config Release -j -v

 #### Retrieve and prepare model

-You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/resolve/main/llama-2-7b.Q4_0.gguf?download=true) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).
+You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation, or download an already quantized model like [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) or [Meta-Llama-3-8B-Instruct-Q4_0.gguf](https://huggingface.co/aptha/Meta-Llama-3-8B-Instruct-Q4_0-GGUF/resolve/main/Meta-Llama-3-8B-Instruct-Q4_0.gguf).

 ##### Check device

@@ -484,17 +466,7 @@ If you already have a recent version of Microsoft Visual Studio, you can skip th

 3. Install Intel® oneAPI Base toolkit

-SYCL backend depends on:
-  - Intel® oneAPI DPC++/C++ compiler/running-time.
-  - Intel® oneAPI DPC++/C++ library (oneDPL).
-  - Intel® oneAPI Deep Neural Network Library (oneDNN).
-  - Intel® oneAPI Math Kernel Library (oneMKL).
-
-All above are included in both **Intel® oneAPI Base toolkit** and **Intel® Deep Learning Essentials** packages.
-
-It's recommended to install **Intel® Deep Learning Essentials** which only provides the necessary libraries with less size.
-
-The **Intel® oneAPI Base toolkit** and **Intel® Deep Learning Essentials** can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
+The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.

 Please follow the instructions for downloading and installing the Toolkit for Windows, and preferably keep the default installation values unchanged, notably the installation path *(`C:\Program Files (x86)\Intel\oneAPI` by default)*.

--- a/docs/backend/zDNN.md
+++ b/docs/backend/zDNN.md
@@ -1,61 +0,0 @@
-# llama.cpp for IBM zDNN Accelerator
-
-## Background
-
-IBM zDNN (Z Deep Neural Network) is a hardware acceleration library designed specifically to leverage the IBM NNPA (Neural Network Processor Assist) accelerator located within IBM Telum I and II processors. It provides significant performance improvements for neural network inference operations.
-
-### Llama.cpp + IBM zDNN
-
-The llama.cpp zDNN backend is designed to enable llama.cpp on IBM z17 and later systems via the IBM zDNN hardware acceleration library.
-
-## Software & Hardware Support
-
-| Hardware Level       | Status        | Verified                   |
-| -------------------- | ------------- | -------------------------- |
-| IBM z17 / LinuxONE 5 | Supported     | RHEL 9.6, IBM z17, 40 IFLs |
-| IBM z16 / LinuxONE 4 | Not Supported |                            |
-
-## Data Types Supported
-
-| Data Type | Status    |
-| --------- | --------- |
-| F32       | Supported |
-| F16       | Supported |
-| BF16      | Supported |
-
-## CMake Options
-
-The IBM zDNN backend has the following CMake options that control the behaviour of the backend.
-
-| CMake Option | Default Value | Description                         |
-| ------------ | ------------- | ----------------------------------- |
-| `GGML_ZDNN`  | `OFF`         | Compile llama.cpp with zDNN support |
-| `ZDNN_ROOT`  | `""`          | Override zDNN library lookup        |
-
-## 1. Install zDNN Library
-
-Note: Using the zDNN library provided via `apt` or `yum` may not work correctly as reported in [#15772](https://github.com/ggml-org/llama.cpp/issues/15772). It is preferred that you compile from source.
-
-```sh
-git clone --recurse-submodules https://github.com/IBM/zDNN
-cd zDNN
-
-autoreconf .
-./configure --prefix=/opt/zdnn-libs
-
-make build
-sudo make install
-```
-
-## 2. Build llama.cpp
-
-```sh
-git clone https://github.com/ggml-org/llama.cpp
-cd llama.cpp
-
-cmake -S . -G Ninja -B build \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DGGML_ZDNN=ON \
-    -DZDNN_ROOT=/opt/zdnn-libs
-cmake --build build --config Release -j$(nproc)
-```
--- a/docs/build-riscv64-spacemit.md
+++ b/docs/build-riscv64-spacemit.md
@@ -1,89 +0,0 @@
-> [!IMPORTANT]
-> This build documentation is specific only to RISC-V SpacemiT SOCs.
-
-## Build llama.cpp locally (for riscv64)
-
-1. Prepare Toolchain For RISCV
-~~~
-wget https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v1.1.2.tar.xz
-~~~
-
-2. Build
-Below is the build script: it requires utilizing RISC-V vector instructions for acceleration. Ensure the `GGML_CPU_RISCV64_SPACEMIT` compilation option is enabled. The currently supported optimization version is `RISCV64_SPACEMIT_IME1`, corresponding to the `RISCV64_SPACEMIT_IME_SPEC` compilation option. Compiler configurations are defined in the `riscv64-spacemit-linux-gnu-gcc.cmake` file. Please ensure you have installed the RISC-V compiler and set the environment variable via `export RISCV_ROOT_PATH={your_compiler_path}`.
-```bash
-
-cmake -B build \
-    -DCMAKE_BUILD_TYPE=Release \
-    -DGGML_CPU_RISCV64_SPACEMIT=ON \
-    -DLLAMA_CURL=OFF \
-    -DGGML_RVV=ON \
-    -DGGML_RV_ZFH=ON \
-    -DGGML_RV_ZICBOP=ON \
-    -DRISCV64_SPACEMIT_IME_SPEC=RISCV64_SPACEMIT_IME1 \
-    -DCMAKE_TOOLCHAIN_FILE=${PWD}/cmake/riscv64-spacemit-linux-gnu-gcc.cmake \
-    -DCMAKE_INSTALL_PREFIX=build/installed
-
-cmake --build build --parallel $(nproc) --config Release
-
-pushd build
-make install
-popd
-```
-
-## Simulation
-You can use QEMU to perform emulation on non-RISC-V architectures.
-
-1. Download QEMU
-~~~
-wget https://archive.spacemit.com/spacemit-ai/qemu/jdsk-qemu-v0.0.14.tar.gz
-~~~
-
-2. Run Simulation
-After build your llama.cpp, you can run the executable file via QEMU for simulation, for example:
-~~~
-export QEMU_ROOT_PATH={your QEMU file path}
-export RISCV_ROOT_PATH_IME1={your RISC-V compiler path}
-
-${QEMU_ROOT_PATH}/bin/qemu-riscv64 -L ${RISCV_ROOT_PATH_IME1}/sysroot -cpu max,vlen=256,elen=64,vext_spec=v1.0 ${PWD}/build/bin/llama-cli -m ${PWD}/models/Qwen2.5-0.5B-Instruct-Q4_0.gguf -t 1
-~~~
-## Performance
-#### Quantization Support For Matrix
-~~~
-model name      : Spacemit(R) X60
-isa             : rv64imafdcv_zicbom_zicboz_zicntr_zicond_zicsr_zifencei_zihintpause_zihpm_zfh_zfhmin_zca_zcd_zba_zbb_zbc_zbs_zkt_zve32f_zve32x_zve64d_zve64f_zve64x_zvfh_zvfhmin_zvkt_sscofpmf_sstc_svinval_svnapot_svpbmt
-mmu             : sv39
-uarch           : spacemit,x60
-mvendorid       : 0x710
-marchid         : 0x8000000058000001
-~~~
-
-Q4_0
-|   Model    |   Size   | Params | backend | threads | test | t/s |
-| -----------| -------- | ------ | ------- | ------- | ---- |------|
-Qwen2.5 0.5B |403.20 MiB|630.17 M|   cpu   |    4    | pp512|64.12 ± 0.26|
-Qwen2.5 0.5B |403.20 MiB|630.17 M|   cpu   |    4    | tg128|10.03 ± 0.01|
-Qwen2.5 1.5B |1011.16 MiB| 1.78 B |   cpu   |    4    | pp512|24.16 ± 0.02|
-Qwen2.5 1.5B |1011.16 MiB| 1.78 B |   cpu   |    4    | tg128|3.83 ± 0.06|
-Qwen2.5 3B   | 1.86 GiB  | 3.40 B |   cpu   |    4    | pp512|12.08 ± 0.02|
-Qwen2.5 3B   | 1.86 GiB  | 3.40 B |   cpu   |    4    | tg128|2.23 ± 0.02|
-
-Q4_1
-|   Model    |   Size   | Params | backend | threads | test | t/s |
-| -----------| -------- | ------ | ------- | ------- | ---- |------|
-Qwen2.5 0.5B |351.50 MiB|494.03 M|   cpu   |    4    | pp512|62.07 ± 0.12|
-Qwen2.5 0.5B |351.50 MiB|494.03 M|   cpu   |    4    | tg128|9.91 ± 0.01|
-Qwen2.5 1.5B |964.06 MiB| 1.54 B |   cpu   |    4    | pp512|22.95 ± 0.25|
-Qwen2.5 1.5B |964.06 MiB| 1.54 B |   cpu   |    4    | tg128|4.01 ± 0.15|
-Qwen2.5 3B   | 1.85 GiB | 3.09 B |   cpu   |    4    | pp512|11.55 ± 0.16|
-Qwen2.5 3B   | 1.85 GiB | 3.09 B |   cpu   |    4    | tg128|2.25 ± 0.04|
-
-
-Q4_K
-|   Model    |   Size   | Params | backend | threads | test | t/s |
-| -----------| -------- | ------ | ------- | ------- | ---- |------|
-Qwen2.5 0.5B |462.96 MiB|630.17 M|   cpu   |    4    | pp512|9.29 ± 0.05|
-Qwen2.5 0.5B |462.96 MiB|630.17 M|   cpu   |    4    | tg128|5.67 ± 0.04|
-Qwen2.5 1.5B | 1.04 GiB | 1.78 B |   cpu   |    4    | pp512|10.38 ± 0.10|
-Qwen2.5 1.5B | 1.04 GiB | 1.78 B |   cpu   |    4    | tg128|3.17 ± 0.08|
-Qwen2.5 3B   | 1.95 GiB | 3.40 B |   cpu   |    4    | pp512|4.23 ± 0.04|
-Qwen2.5 3B   | 1.95 GiB | 3.40 B |   cpu   |    4    | tg128|1.73 ± 0.00|
--- a/docs/build-s390x.md
+++ b/docs/build-s390x.md
@@ -241,8 +241,8 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 |            | VX/VXE/VXE2 | zDNN | Spyre |
 |------------|-------------|------|-------|
 | FP32       | ✅           | ✅    | ❓     |
-| FP16       | ✅           | ✅    | ❓     |
-| BF16       | 🚫           | ✅    | ❓     |
+| FP16       | ✅           | ❓    | ❓     |
+| BF16       | 🚫           | ❓    | ❓     |
 | Q4_0       | ✅           | ❓    | ❓     |
 | Q4_1       | ✅           | ❓    | ❓     |
 | MXFP4      | 🚫           | ❓    | ❓     |
@@ -272,4 +272,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 -   🚫 - acceleration unavailable, will still run using scalar implementation
 -   ❓ - acceleration unknown, please contribute if you can test it yourself

-Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 7, 2025.
+Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 6, 2025.
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment

 The defaults are:

- `MUSA_VERSION` set to `rc4.3.0`
+- `MUSA_VERSION` set to `rc4.2.0`

 The resulting images, are essentially the same as the non-MUSA images:

--- a/docs/ops.md
+++ b/docs/ops.md
@@ -18,7 +18,6 @@ Legend:
 |                              ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                              ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
 |                             ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
-|                           ADD_ID | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                           ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
 |                           ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                          ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
@@ -27,7 +26,6 @@ Legend:
 |                             CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
 |                          CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
 |                       CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                          CONV_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                              COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
@@ -51,11 +49,9 @@ Legend:
 |                         GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
 |                    GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                       GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-|               GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                      HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 |                        HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 |                           IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
-|                        IM2COL_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                          L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                       LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                              LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
@@ -65,9 +61,7 @@ Legend:
 |                       MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
 |                              NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 |                             NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                     NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                   OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
-|                     OPT_STEP_SGD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                         OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
 |                              PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                   PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
@@ -104,7 +98,6 @@ Legend:
 |                              SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
 |                         SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                           SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
-|                       SWIGLU_OAI | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
 |                             TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
 |               TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                          UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
--- a/docs/ops/zDNN.csv
+++ b/docs/ops/zDNN.csv
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -20,6 +20,7 @@ else()

    add_subdirectory(gguf-hash)
    add_subdirectory(gguf)
+    add_subdirectory(gritlm)
    add_subdirectory(lookahead)
    add_subdirectory(lookup)
    add_subdirectory(parallel)
--- a/examples/Miku.sh
+++ b/examples/Miku.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+set -e
+
+AI_NAME="${AI_NAME:-Miku}"
+MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
+USER_NAME="${USER_NAME:-Anon}"
+
+# Uncomment and adjust to the number of CPU cores you want to use.
+#N_THREAD="${N_THREAD:-4}"
+CTX_SIZE="${CTX_SIZE:-4096}"
+N_PREDICTS="${N_PREDICTS:-4096}"
+
+GEN_OPTIONS=(--batch_size 1024
+--ctx_size "$CTX_SIZE"
+--keep -1
+--repeat_last_n 256
+--repeat_penalty 1.17647
+--temp 0.6
+--mirostat 2)
+
+if [ -n "$N_THREAD" ]; then
+    GEN_OPTIONS+=(--threads "$N_THREAD")
+fi
+
+./llama-cli "${GEN_OPTIONS[@]}" \
+    --model "$MODEL" \
+    --in-prefix " " \
+    --in-suffix "${AI_NAME}:" \
+    --n_predict "$N_PREDICTS" \
+    --color --interactive \
+    --reverse-prompt "${USER_NAME}:" \
+    --prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
+${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
+${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
+${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
+${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
+The conversation is only between ${USER_NAME} and ${AI_NAME}.
+The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
+${AI_NAME} can only communicate through text, so she can't send images or videos.
+
+
+${USER_NAME}: Hello!
+${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk, so it's important that I make a good first impression!
+${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant (or whatever you like!), it's so nice to meet you! ^_^
+${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
+${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
+${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
+${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
+${AI_NAME}: What do you like to do in your free time? ^_^
+${USER_NAME}:" "$@"
--- a/examples/chat-13B.bat
+++ b/examples/chat-13B.bat
@@ -0,0 +1,57 @@
+@setlocal disabledelayedexpansion enableextensions
+@echo off
+
+cd /d "%~dp0.."
+if not "%errorlevel%"=="0" (
+    echo Unable to change directory.
+    pause
+    exit /b 1
+)
+
+if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin"
+if not defined USER_NAME set "USER_NAME=User"
+if not defined AI_NAME set "AI_NAME=ChatLLaMa"
+rem Adjust to the number of CPU cores you want to use.
+rem if not defined N_THREAD set "N_THREAD=8"
+rem Number of tokens to predict (made it larger than default because we want a long interaction)
+if not defined N_PREDICTS set "N_PREDICTS=2048"
+if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
+
+rem Default main script paths
+set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
+
+rem Get main script path from command line arguments
+set "MAIN_SCRIPT_PATH=%~1"
+
+rem If the main script path was not specified, try the default paths
+if not defined MAIN_SCRIPT_PATH (
+    for %%i in (%DEFAULT_MAIN_SCRIPT_PATHS%) do (
+        if exist "%%i" set "MAIN_SCRIPT_PATH=%%i"
+    )
+)
+
+rem If the main script path was not found, tell the user how to specify it
+if not defined MAIN_SCRIPT_PATH (
+    echo The main script could not be found. Please provide the path to the main script as 1st argument to this script, or place the main script in one of the default locations:
+    echo %DEFAULT_MAIN_SCRIPT_PATHS%
+    pause
+    exit /b 1
+)
+
+rem Default context, feel free to edit it
+set "PROMPT_TEXT=Text transcript of a never ending dialog, where %USER_NAME% interacts with an AI assistant named %AI_NAME%. %AI_NAME% is helpful, kind, honest, friendly, good at writing and never fails to answer %USER_NAME%'s requests immediately and with details and precision. There are no annotations like (30 seconds passed...) or (to himself), just what %USER_NAME% and %AI_NAME% say aloud to each other. The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. The transcript only includes text, it does not include markup like HTML and Markdown."
+
+rem Set a temporary variable if N_THREAD is set
+if defined N_THREAD (
+    set "_N_THREAD=--threads %N_THREAD%"
+) else (
+    set "_N_THREAD="
+)
+
+rem Run the script
+echo "%MAIN_SCRIPT_PATH%" %GEN_OPTIONS% %_N_THREAD% ^
+  --model "%MODEL%" ^
+  --n_predict %N_PREDICTS% ^
+  --color --interactive ^
+  --reverse-prompt "%USER_NAME%:" ^
+  --prompt "%PROMPT_TEXT%"
--- a/examples/chat-13B.sh
+++ b/examples/chat-13B.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+
+set -e
+
+cd "$(dirname "$0")/.." || exit
+
+MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
+PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
+USER_NAME="${USER_NAME:-USER}"
+AI_NAME="${AI_NAME:-ChatLLaMa}"
+
+# Adjust to the number of CPU cores you want to use.
+N_THREAD="${N_THREAD:-8}"
+# Number of tokens to predict (made it larger than default because we want a long interaction)
+N_PREDICTS="${N_PREDICTS:-2048}"
+
+# Note: you can also override the generation options by specifying them on the command line:
+# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
+GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
+
+DATE_TIME=$(date +%H:%M)
+DATE_YEAR=$(date +%Y)
+
+PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
+
+sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
+    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
+    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
+    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
+     $PROMPT_TEMPLATE > $PROMPT_FILE
+
+# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
+./llama-cli $GEN_OPTIONS \
+  --model "$MODEL" \
+  --threads "$N_THREAD" \
+  --n_predict "$N_PREDICTS" \
+  --color --interactive \
+  --file ${PROMPT_FILE} \
+  --reverse-prompt "${USER_NAME}:" \
+  --in-prefix ' ' \
+  "$@"
--- a/examples/chat-persistent.sh
+++ b/examples/chat-persistent.sh
@@ -0,0 +1,149 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+cd "$(dirname "$0")/.." || exit
+
+if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
+    echo >&2 "error: PROMPT_CACHE_FILE and CHAT_SAVE_DIR must be provided"
+    exit 1
+fi
+
+MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}"
+PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
+USER_NAME="${USER_NAME:-User}"
+AI_NAME="${AI_NAME:-ChatLLaMa}"
+DATE_TIME="$(date +%H:%M)"
+DATE_YEAR="$(date +%Y)"
+
+LOG="${CHAT_SAVE_DIR}/main.log"
+LOG_BG="${CHAT_SAVE_DIR}/main-bg.log"
+CUR_PROMPT_FILE="${CHAT_SAVE_DIR}/current-prompt.txt"
+CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
+NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
+NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
+
+SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
+'|'\
+'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
+SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
+
+CTX_SIZE=2048
+CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
+OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
+
+# An unbuffered `tail -c+N`
+skip_bytes() {
+    LANG=C IFS= read -r -n "$1" -d '' c
+    while LANG=C IFS= read -r -n 1 -d '' c; do
+        printf '%s' "$c"
+    done
+}
+
+mkdir -p "$CHAT_SAVE_DIR"
+echo >"$LOG"
+trap "tail -n100 ${LOG}" EXIT
+
+if [[ ! -e "$CUR_PROMPT_FILE" ]]; then
+    sed -e "s/\[\[USER_NAME\]\]/${USER_NAME}/g" \
+        -e "s/\[\[AI_NAME\]\]/${AI_NAME}/g" \
+        -e "s/\[\[DATE_TIME\]\]/${DATE_TIME}/g" \
+        -e "s/\[\[DATE_YEAR\]\]/${DATE_YEAR}/g" \
+        "$PROMPT_TEMPLATE" >"$CUR_PROMPT_FILE"
+fi
+
+if [[ ! -e "$NEXT_PROMPT_FILE" ]]; then
+    sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
+fi
+
+if [[ "$(tail -c4 "$NEXT_PROMPT_FILE")" != "..." ]]; then
+    echo '...' >>"$NEXT_PROMPT_FILE"
+fi
+
+if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
+    echo 'Prompt cache does not exist, building...'
+    # Default batch_size to 64 here for better user feedback during initial prompt processing
+    ./llama-cli 2>>"$LOG" \
+        --batch_size 64 \
+        "${OPTS[@]}" \
+        --prompt-cache "$PROMPT_CACHE_FILE" \
+        --file "$CUR_PROMPT_FILE" \
+        --n_predict 1
+    echo
+    echo 'Done!'
+fi
+
+if [[ ! -e "$CUR_PROMPT_CACHE" ]]; then
+    cp "$PROMPT_CACHE_FILE" "$CUR_PROMPT_CACHE"
+fi
+if [[ ! -e "$NEXT_PROMPT_CACHE" ]]; then
+    cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
+fi
+
+printf '%s ' "$(< "$CUR_PROMPT_FILE")"
+n_tokens=0
+
+while read -e line; do
+    # Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
+    n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
+
+    # Swap prompts when we're about to run out of context
+    if ((n_predict <= 0)); then
+        wait # for background main (below) to finish with next prompt
+        mv "$NEXT_PROMPT_FILE"  "$CUR_PROMPT_FILE"
+        mv "$NEXT_PROMPT_CACHE" "$CUR_PROMPT_CACHE"
+
+        sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
+        echo '...' >>"$NEXT_PROMPT_FILE"
+        cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
+
+        n_tokens=0
+        n_predict=$((CTX_SIZE / 2))
+    fi
+
+    echo " ${line}" >>"$CUR_PROMPT_FILE"
+    if ((n_tokens > CTX_ROTATE_POINT)); then
+        echo " ${line}" >>"$NEXT_PROMPT_FILE"
+    fi
+
+    n_prompt_len_pre=$(($(wc -c <"$CUR_PROMPT_FILE")))
+
+    printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
+
+    ./llama-cli 2>>"$LOG" "${OPTS[@]}" \
+            --prompt-cache "$CUR_PROMPT_CACHE" \
+            --prompt-cache-all \
+            --file "$CUR_PROMPT_FILE" \
+            --reverse-prompt "${USER_NAME}:" \
+            --n_predict "$n_predict" |
+        skip_bytes 1 |                  # skip BOS token added by ./llama-cli
+        tee "$CUR_PROMPT_FILE.tmp" |    # save prompt + generation to tmp file
+        skip_bytes "$n_prompt_len_pre"  # print generation
+
+    mv "$CUR_PROMPT_FILE.tmp" "$CUR_PROMPT_FILE"
+
+    # if we hit n_predict instead of reverse-prompt, we need to add the prompt
+    if [[ "$(tail -n1 "$CUR_PROMPT_FILE")" != "${USER_NAME}:" ]]; then
+        printf '\n%s:' "$USER_NAME"
+        printf '\n%s:' "$USER_NAME" >> "$CUR_PROMPT_FILE"
+    fi
+
+    printf ' '
+
+    if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
+        echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
+        exit 1
+    fi
+
+    n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")
+
+    if ((n_tokens > CTX_ROTATE_POINT)); then
+        tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
+    fi
+
+    # Update cache for next prompt in background, ideally during user input
+    ./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
+          --prompt-cache "$NEXT_PROMPT_CACHE" \
+          --file "$NEXT_PROMPT_FILE" \
+          --n_predict 1 &
+done
--- a/examples/chat-vicuna.sh
+++ b/examples/chat-vicuna.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+
+set -e
+
+cd "$(dirname "$0")/.." || exit
+
+MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
+PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
+USER_NAME="### Human"
+AI_NAME="### Assistant"
+
+# Adjust to the number of CPU cores you want to use.
+N_THREAD="${N_THREAD:-8}"
+# Number of tokens to predict (made it larger than default because we want a long interaction)
+N_PREDICTS="${N_PREDICTS:-2048}"
+
+# Note: you can also override the generation options by specifying them on the command line:
+# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
+GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
+
+DATE_TIME=$(date +%H:%M)
+DATE_YEAR=$(date +%Y)
+
+PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
+
+sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
+    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
+    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
+    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
+     $PROMPT_TEMPLATE > $PROMPT_FILE
+
+# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
+./bin/llama-cli $GEN_OPTIONS \
+  --model "$MODEL" \
+  --threads "$N_THREAD" \
+  --n_predict "$N_PREDICTS" \
+  --color --interactive \
+  --file ${PROMPT_FILE} \
+  --reverse-prompt "### Human:" \
+  --in-prefix ' ' \
+  "$@"
--- a/examples/chat.sh
+++ b/examples/chat.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+
+#
+# Temporary script - will be removed in the future
+#
+
+cd `dirname $0`
+cd ..
+
+# Important:
+#
+#   "--keep 48" is based on the contents of prompts/chat-with-bob.txt
+#
+./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
+    --repeat_penalty 1.0 --color -i \
+    -r "User:" -f prompts/chat-with-bob.txt
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -510,27 +510,19 @@ static void diffusion_generate(llama_context *          ctx,
    n_generated = params.max_length;
 }

-static std::string format_input_text(const std::string & prompt, const std::string & system_prompt, bool use_chat_template, llama_model * model) {
+static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
    if (!use_chat_template) {
        return prompt;
    }

    auto chat_templates = common_chat_templates_init(model, "");
+
    common_chat_templates_inputs inputs;
-    common_chat_msg system_msg;
-
-    if (!system_prompt.empty()) {
-        system_msg.role = "system";
-        system_msg.content = system_prompt;
-        inputs.messages.push_back(system_msg);
-    }
-
-    common_chat_msg user_msg;
-    user_msg.role = "user";
-    user_msg.content = prompt;
-
-    inputs.messages.push_back(user_msg);
+    common_chat_msg              user_msg;
+    user_msg.role                = "user";
+    user_msg.content             = prompt;
    inputs.add_generation_prompt = true;
+    inputs.messages.push_back(user_msg);

    auto result = common_chat_templates_apply(chat_templates.get(), inputs);

@@ -587,8 +579,7 @@ int main(int argc, char ** argv) {
    llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads);

    const llama_vocab * vocab            = llama_model_get_vocab(model);
-
-    std::string         formatted_prompt = format_input_text(params.prompt, params.system_prompt, params.enable_chat_template, model);
+    std::string         formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);

    std::vector<llama_token> input_tokens = common_tokenize(vocab,
                                                            formatted_prompt,
@@ -605,7 +596,6 @@ int main(int argc, char ** argv) {
    }

    llama_token mask_token_id = llama_vocab_mask(vocab);
-
    GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);

    bool visual_mode = params.diffusion.visual_mode;
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@@ -43,8 +43,8 @@ The above command will output space-separated float values.
 | $"string"$   | |
 |--------------|-|
 | "\n"         | (default)
-| "<#embSep#>" | for example
-| "<#sep#>"    | other example
+| "<#embSep#>" | for exemple
+| "<#sep#>"    | other exemple

 ## examples
 ### Unix-based systems (Linux, macOS, etc.):
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -95,13 +95,8 @@ int main(int argc, char ** argv) {
        params.n_batch = params.n_ctx;
    }

-    // for non-causal models, batch size must be equal to ubatch size
-    if (params.attention_type != LLAMA_ATTENTION_TYPE_CAUSAL) {
-        params.n_ubatch = params.n_batch;
-    }
-
-    // get max number of sequences per batch
-    const int n_seq_max = llama_max_parallel_sequences();
+    // For non-causal models, batch size must be equal to ubatch size
+    params.n_ubatch = params.n_batch;

    llama_backend_init();
    llama_numa_init(params.numa);
@@ -149,7 +144,6 @@ int main(int argc, char ** argv) {
    // get added sep and eos token, if any
    const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
    const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
-    const char * rerank_prompt = llama_model_chat_template(model, "rerank");

    // tokenize the prompts and trim
    std::vector<std::vector<int32_t>> inputs;
@@ -159,28 +153,21 @@ int main(int argc, char ** argv) {
        // split classification pairs and insert expected separator tokens
        if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
            std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
-            if (rerank_prompt != nullptr) {
-                const std::string query = pairs[0];
-                const std::string doc = pairs[1];
-                std::string final_prompt = rerank_prompt;
-                string_replace_all(final_prompt, "{query}"   , query);
-                string_replace_all(final_prompt, "{document}", doc  );
-                inp = common_tokenize(vocab, final_prompt, true, true);
-            } else {
-                std::string final_prompt;
-                for (size_t i = 0; i < pairs.size(); i++) {
-                    final_prompt += pairs[i];
-                    if (i != pairs.size() - 1) {
-                        if (!added_eos_token.empty()) {
-                            final_prompt += added_eos_token;
-                        }
-                        if (!added_sep_token.empty()) {
-                            final_prompt += added_sep_token;
-                        }
+            std::string final_prompt;
+
+            for (size_t i = 0; i < pairs.size(); i++) {
+                final_prompt += pairs[i];
+                if (i != pairs.size() - 1) {
+                    if (!added_eos_token.empty()) {
+                        final_prompt += added_eos_token;
+                    }
+                    if (!added_sep_token.empty()) {
+                        final_prompt += added_sep_token;
                    }
                }
-                inp = common_tokenize(ctx, final_prompt, true, true);
            }
+
+            inp = common_tokenize(ctx, final_prompt, true, true);
        } else {
            inp = common_tokenize(ctx, prompt, true, true);
        }
@@ -242,7 +229,7 @@ int main(int argc, char ** argv) {
        const uint64_t n_toks = inp.size();

        // encode if at capacity
-        if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
+        if (batch.n_tokens + n_toks > n_batch) {
            float * out = emb + e * n_embd;
            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
            e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
--- a/examples/eval-callback/CMakeLists.txt
+++ b/examples/eval-callback/CMakeLists.txt
@@ -5,11 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 set(TEST_TARGET test-eval-callback)
-if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
-        add_test(NAME ${TEST_TARGET}
-                        COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
-else()
-        add_test(NAME ${TEST_TARGET}
-                        COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K-be.gguf --model stories260K-be.gguf --prompt hello --seed 42 -ngl 0)
-endif()
+add_test(NAME ${TEST_TARGET}
+        COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
 set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
--- a/examples/gritlm/CMakeLists.txt
+++ b/examples/gritlm/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-gritlm)
+add_executable(${TARGET} gritlm.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/gritlm/README.md
+++ b/examples/gritlm/README.md
@@ -0,0 +1,62 @@
+## Generative Representational Instruction Tuning (GRIT) Example
+[gritlm] a model which can generate embeddings as well as "normal" text
+generation depending on the instructions in the prompt.
+
+* Paper: https://arxiv.org/pdf/2402.09906.pdf
+
+### Retrieval-Augmented Generation (RAG) use case
+One use case for `gritlm` is to use it with RAG. If we recall how RAG works is
+that we take documents that we want to use as context, to ground the large
+language model (LLM), and we create token embeddings for them. We then store
+these token embeddings in a vector database.
+
+When we perform a query, prompt the LLM, we will first create token embeddings
+for the query and then search the vector database to retrieve the most
+similar vectors, and return those documents so they can be passed to the LLM as
+context. Then the query and the context will be passed to the LLM which will
+have to _again_ create token embeddings for the query. But because gritlm is used
+the first query can be cached and the second query tokenization generation does
+not have to be performed at all.
+
+### Running the example
+Download a Grit model:
+```console
+$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
+```
+
+Run the example using the downloaded model:
+```console
+$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
+
+Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
+Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
+Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
+Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
+
+Oh, brave adventurer, who dared to climb
+The lofty peak of Mt. Fuji in the night,
+When shadows lurk and ghosts do roam,
+And darkness reigns, a fearsome sight.
+
+Thou didst set out, with heart aglow,
+To conquer this mountain, so high,
+And reach the summit, where the stars do glow,
+And the moon shines bright, up in the sky.
+
+Through the mist and fog, thou didst press on,
+With steadfast courage, and a steadfast will,
+Through the darkness, thou didst not be gone,
+But didst climb on, with a steadfast skill.
+
+At last, thou didst reach the summit's crest,
+And gazed upon the world below,
+And saw the beauty of the night's best,
+And felt the peace, that only nature knows.
+
+Oh, brave adventurer, who dared to climb
+The lofty peak of Mt. Fuji in the night,
+Thou art a hero, in the eyes of all,
+For thou didst conquer this mountain, so bright.
+```
+
+[gritlm]: https://github.com/ContextualAI/gritlm
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -0,0 +1,231 @@
+#include "arg.h"
+#include "common.h"
+#include "llama.h"
+
+#include <string>
+#include <vector>
+
+// #define GRIT_DEBUG
+
+static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
+    std::vector<std::vector<float>> result;
+
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
+
+    for (uint64_t i = 0; i < sentences.size(); i++) {
+        common_batch_clear(batch);
+
+        const std::string input_string = instruction + sentences[i];
+
+        std::vector<llama_token> inputs = common_tokenize(vocab, input_string, true, false);
+
+        const int32_t n_toks = inputs.size();
+
+        // GritLM seems to have EOS = ""
+        // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
+        // inputs.push_back(llama_vocab_eos(vocab));
+
+        // we want to ignore instruction tokens for mean pooling
+        const int32_t n_inst = common_tokenize(vocab, instruction, true, false).size();
+
+#ifdef GRIT_DEBUG
+        // debug tokens - should be matching as referenced in the GritLM sample
+        std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
+            std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
+        });
+        std::printf("\n");
+#endif
+
+        // add input to batch (this increments n_tokens)
+        for (int32_t j = 0; j < n_toks; j++) {
+            common_batch_add(batch, inputs[j], j, { 0 }, true);
+        }
+
+        // clear previous kv_cache values (irrelevant for embeddings)
+        llama_memory_clear(llama_get_memory(ctx), true);
+        llama_set_causal_attn(ctx, false);
+
+        // run model
+        llama_decode(ctx, batch);
+
+        // get embedding dimensions
+        uint64_t n_embd = llama_model_n_embd(model);
+
+        // allocate embedding output
+        std::vector<float> emb_unorm(n_embd, 0.0f);
+
+        // sum up all token embeddings
+        for (int32_t k = n_inst; k < n_toks; k++) {
+            float * emb = llama_get_embeddings_ith(ctx, k);
+            for (uint64_t j = 0; j < n_embd; j++) {
+                emb_unorm[j] += emb[j];
+            }
+        }
+
+        // divide by number of tokens (mean pooling)
+        {
+            const uint64_t n_sent = n_toks - n_inst;
+
+            for (uint64_t j = 0; j < n_embd; j++) {
+                emb_unorm[j] /= n_sent;
+            }
+        }
+
+        std::vector<float> emb_norm(emb_unorm.size());
+        common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd, 2);
+        result.push_back(emb_norm);
+
+#ifdef GRIT_DEBUG
+        // print out emb_norm
+        std::printf("embedding %ld: ", i);
+        for (uint64_t j = 0; j < n_embd; j++) {
+            std::printf("%.5f ", emb_norm[j]);
+        }
+        std::printf("\n\n");
+#endif
+    }
+
+    llama_batch_free(batch);
+
+    return result;
+}
+
+static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) {
+    std::string result;
+
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    llama_token eos_token = llama_vocab_eos(vocab);
+
+    llama_memory_clear(llama_get_memory(ctx), true);
+    llama_set_causal_attn(ctx, true);
+
+    llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
+
+    std::vector<llama_token> inputs = common_tokenize(vocab, prompt, false, true);
+    int32_t i_current_token = 0;
+
+    while (true) {
+        common_batch_clear(bat);
+        {
+            const int32_t n_inputs = inputs.size();
+
+            for (int32_t i = 0; i < n_inputs; i++) {
+                common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
+            }
+        }
+        inputs.clear();
+
+        llama_decode(ctx, bat);
+
+        llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
+
+        if (token == eos_token) {
+            break;
+        }
+
+        std::string piece = common_token_to_piece(ctx, token);
+        if (stream) {
+            std::printf("%s", piece.c_str());
+            std::fflush(stdout);
+        }
+
+        inputs.push_back(token);
+
+        result += piece;
+    }
+
+    if (stream) {
+        std::printf("\n");
+    }
+
+    llama_batch_free(bat);
+
+    return result;
+}
+
+static std::string gritlm_instruction(const std::string & instruction) {
+    return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
+}
+
+int main(int argc, char * argv[]) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+        return 1;
+    }
+
+    common_init();
+
+    llama_model_params mparams = common_model_params_to_llama(params);
+    llama_context_params cparams = common_context_params_to_llama(params);
+
+    cparams.embeddings = true;
+
+    llama_backend_init();
+
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
+
+    // create generation context
+    llama_context * ctx = llama_init_from_model(model, cparams);
+
+    auto sparams = llama_sampler_chain_default_params();
+
+    sparams.no_perf = false;
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+    // ### Embedding/Representation ###
+    // samples taken from: https://github.com/ContextualAI/gritlm#basic
+    {
+        const std::string instruction = "Given a scientific paper title, retrieve the paper's abstract";
+
+        const std::vector<std::string> queries = {
+            "Bitcoin: A Peer-to-Peer Electronic Cash System",
+            "Generative Representational Instruction Tuning",
+        };
+
+        const std::vector<std::string> documents = {
+            "A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
+            "All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
+        };
+
+        // No need to add instruction for retrieval documents
+        const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
+        const std::vector<std::vector<float>> q_rep = encode(ctx, queries,   gritlm_instruction(instruction));
+
+        const int n_embd = llama_model_n_embd(model);
+
+        const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
+        const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
+        const float cosine_sim_q1_d0 = common_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
+        const float cosine_sim_q1_d1 = common_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
+
+        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
+        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
+        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[0].c_str(), cosine_sim_q1_d0);
+        std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
+    }
+
+    llama_set_embeddings(ctx, false);
+
+    // ### Generation ###
+    // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
+    {
+        const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
+        std::string response = generate(ctx, smpl, prompt, true);
+    }
+
+    llama_sampler_free(smpl);
+    llama_free(ctx);
+    llama_model_free(model);
+    llama_backend_free();
+
+    return 0;
+}
--- a/examples/jeopardy/README.md
+++ b/examples/jeopardy/README.md
@@ -0,0 +1,21 @@
+# llama.cpp/example/jeopardy
+
+This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.
+
+The jeopardy test can be used to compare the fact knowledge of different models and compare them to each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
+
+
+Step 1: Open jeopardy.sh and modify the following:
+```
+MODEL=(path to your model)
+MODEL_NAME=(name of your model)
+prefix=(basically, if you use vicuna it's Human: , if you use something else it might be User: , etc)
+opts=(add -instruct here if needed for your model, or anything else you want to test out)
+```
+Step 2: Run `jeopardy.sh` from the llama.cpp folder
+
+Step 3: Repeat steps 1 and 2 until you have all the results you need.
+
+Step 4: Run `graph.py`, and follow the instructions. At the end, it will generate your final graph.
+
+Note: The Human bar is based off of the full, original 100 sample questions. If you modify the question count or questions, it will not be valid.
--- a/examples/jeopardy/graph.py
+++ b/examples/jeopardy/graph.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+import matplotlib.pyplot as plt
+import os
+import csv
+
+labels = []
+numbers = []
+numEntries = 1
+
+rows = []
+
+
+def bar_chart(numbers, labels, pos):
+    plt.bar(pos, numbers, color='blue')
+    plt.xticks(ticks=pos, labels=labels)
+    plt.title("Jeopardy Results by Model")
+    plt.xlabel("Model")
+    plt.ylabel("Questions Correct")
+    plt.show()
+
+
+def calculatecorrect():
+    directory = os.fsencode("./examples/jeopardy/results/")
+    csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
+    for row in csv_reader:
+        global rows
+        rows.append(row)
+    for listing in os.listdir(directory):
+        filename = os.fsdecode(listing)
+        if filename.endswith(".txt"):
+            file = open("./examples/jeopardy/results/" + filename, "rt")
+            global labels
+            global numEntries
+            global numbers
+            labels.append(filename[:-4])
+            numEntries += 1
+            i = 1
+            totalcorrect = 0
+            for line in file.readlines():
+                if line.strip() != "------":
+                    print(line)
+                else:
+                    print("Correct answer: " + rows[i][2] + "\n")
+                    i += 1
+                    print("Did the AI get the question right? (y/n)")
+                    if input() == "y":
+                        totalcorrect += 1
+            numbers.append(totalcorrect)
+
+
+if __name__ == '__main__':
+    calculatecorrect()
+    pos = list(range(numEntries))
+    labels.append("Human")
+    numbers.append(48.11)
+    bar_chart(numbers, labels, pos)
+    print(labels)
+    print(numbers)
--- a/examples/jeopardy/jeopardy.sh
+++ b/examples/jeopardy/jeopardy.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+set -e
+
+MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin
+MODEL_NAME=Vicuna
+
+# exec options
+prefix="Human: " # Ex. Vicuna uses "Human: "
+opts="--temp 0 -n 80" # additional flags
+nl='
+'
+introduction="You will be playing a game of Jeopardy. Simply answer the question in the correct format (Ex. What is Paris, or Who is George Washington)."
+
+# file options
+question_file=./examples/jeopardy/questions.txt
+touch ./examples/jeopardy/results/$MODEL_NAME.txt
+output_file=./examples/jeopardy/results/$MODEL_NAME.txt
+
+counter=1
+
+echo 'Running'
+while IFS= read -r question
+do
+  exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
+  echo $counter
+  echo "Current Question: $question"
+  eval "$exe_cmd"
+  echo -e "\n------" >> $output_file
+  counter=$((counter+1))
+done < "$question_file"
--- a/examples/jeopardy/qasheet.csv
+++ b/examples/jeopardy/qasheet.csv
@@ -0,0 +1,103 @@
+Index,Original Category,Original Correct Question,Model Prompt
+1,The Oscars,Who is John Williams?,Which actor Born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
+2,English Literature,What is Paradise Lost?,"What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?"
+3,Writers’ Lesser-Known Works,Who is Niccolò Machiavelli?,"Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?"
+4,Exploration,What is Easter Island (Rapa Nui)?,"James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?"
+5,The Bill of Rights,What is the Eighth Amendment?,England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
+6,Nobel Peace Prize Winners,Who are Nelson Mandela & Desmond Tutu?,"Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?"
+7,Famous Names,Who is Walt Disney?,"In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?"
+8,Geography,What is Colombia?,"Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?"
+9,Fashion History,What are rhinestones?,"Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?"
+10,Movies of the ’80s,What is Driving Miss Daisy?,What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
+11,Novelists,Who is John Grisham?,"A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?"
+12,20th Century Eponyms,What is the Maginot Line?,"A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?"
+13,City History,What is Stockholm?,"Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?"
+14,Brand Names,What is Jacuzzi?,"The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?"
+15,American Authors,Who is Washington Irving?,"In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?"
+16,Symbols,What is “less than”?,What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
+17,Movie Theme Songs,Who is James Bond?,"Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?"
+18,American Novelists,Who is Joseph Heller?,"What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?"
+19,Medieval Places,"What is Canterbury, England? (Canterbury Cathedral)","In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?"
+20,Countries of Africa,What is Morocco?,"At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?"
+21,Statehood,What is Wyoming?,Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
+22,1980s Movies,What is Raiders of the Lost Ark?,"A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?"
+23,Art Exhibitions,Who is Rembrandt?,In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
+24,Countries of the World,What is Mongolia?,"Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?"
+25,Literature,What is “Howl”?,A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
+26,Invasions,Who is William of Orange?,"Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?"
+27,Landmarks,What is the Eiffel Tower?,"After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?"
+28,Geographic Name’s the Same,What is Dover?,"The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?"
+29,Names in the Bookstore,Who is Peter Mark Roget?,"This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?"
+30,U.S. History,Who is Dr. Samuel Mudd?,"An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?"
+31,American Literature,What is The Things They Carried?,"Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?"
+32,Nonfiction,What is The Communist Manifesto,"What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?"
+33, a new version was passed 81 years later,Laws in U.S. History,What is the Civil Rights Act?,,,,,,,,,,,,,,,,,,0, 2/3
+34,Names of Myth,Who is Helen of Troy?,"Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?"
+35,African Countries,What is Sudan?,"Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?"
+36,The Ancient World,What is Alexandria?,"The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?"
+37,Famous Names,Who is Andy Warhol?,"For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?"
+38,People & Places,What is Guam?,"Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory’s largest ethnic group?"
+39,Current World Leaders,What is the Philippines?,"In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?"
+40,Writers & The South,Who is Tennessee Williams?,In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
+41,National Parks,What is Yellowstone?,"What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?"
+42,Sports,Who are the Harlem Globetrotters?,"In 2010 who introduced the 4-point shot, 35 feet from the basket?"
+43,The U.S. Military,What is “Top Gun”?,Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
+44,Art & Science,What is Halley’s Comet?,"A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?"
+45,Words From World War I,What is “tank”?,"In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?"
+46,European History,What is Holy Roman Emperor?,"Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?"
+47,Theater History,Who is Peter Pan?,"In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?"
+48,European Cities,What is Aachen?,"Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?"
+49,Word Origins,What is mantra?,This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
+50,Inventions,What is barbed wire?,1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
+51,World War II,What is Schindler’s list?,"Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?"
+52, their offspring was the source of this mythical object,Mythology,What is the Golden Fleece?
+53,Literature,What is Pride and Prejudice?,"Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?"
+54, only these 2 west of the Mississippi River border each other,U.S. State Names,What are Oregon & Nevada?
+55,Word Origins,What is passion?,"Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?"
+56,World Cinema,What is La Vie en Rose?,"The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?"
+57,History,What is Santa Maria?,"Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?"
+58,Landmarks,What is a kremlin?,Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
+59,Foreign-Born Authors,Who is Vladimir Nabokov?,In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
+60,Astronomy & Geography,What is Capricorn?,"At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?"
+61,Television,What is Law & Order?,"Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?"
+62,British Landmarks,What is the Tower of London?,"Like Sir Thomas More, 3 16th century English queens are buried at what British location?"
+63,Early American History,What are witches?,"In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?"
+64,Geography Mnemonics,What are Arkansas and Louisiana?,"The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?"
+65,Business Milestones,What is the Ford Model T?,"What was first sold in 1908, at a price equivalent to about $27,000 today?"
+66,In The Bookstore,Who is Tom Clancy?,The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
+67,Historic Art,What is the Bayeux Tapestry?,The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
+68,Pop Stars,Who is Madonna?,In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
+69,Classic Tale Characters,Who is Scheherazade?,"In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?"
+70,USA,What is Jack Daniel’s?,"Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?"
+71,Historic People,Who was William Bligh?,"After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?"
+72,The Movies,What is The Godfather?,Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
+73,Continental Geography,What is Colombia?,"Until a 1903 secession, what country's contiguous territory spanned 2 continents?"
+74,Foreign-Born Authors,Who is Isabel Allende?,"Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?"
+75,Historic Crimes,What is the Mona Lisa?,"Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?"
+76,U.S. Bodies of Water,What is Lake Mead?,"Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?"
+77,Gods & Goddesses,Who is Aurora (or Eos)?,"Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?"
+78,America At War,What is the Battle of New Orleans?,"Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?"
+79,Children’s Books,What is The Velveteen Rabbit?,"Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?"
+80,TV Finales,What is Grace and Frankie?,"In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?"
+81,American Poems,Who is Evangeline?,"In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?"
+82,Famous Names,Who is Banksy?,"In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?"
+83,Children’s Lit,What is Charlotte’s Web?,The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
+84,Classic Songs,What is “Here Comes Santa Claus”?,The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
+85,Brand Names,What are Milk Duds?,"Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?"
+86,Countries of the World,What is Italy?,"What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?"
+87,Action Movies,What is Die Hard?,"What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?"
+88,Presidential Facts,Who is Woodrow Wilson?,Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
+89,19th Century Americans,Who is Frederick Douglass?,"Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?"
+90,Latin Phrases,What is “quid pro quo”?,"Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?"
+91,1970s Movies,What is Monty Python and the Holy Grail?,The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
+92,Name’s The Same,What is Manhattan?,"A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?"
+93,U.S. Presidents,Who is Calvin Coolidge?,"Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?"
+94,Plays,What is The Tempest?,A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
+95,Landmarks,What is the Berlin Wall?,"In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?"
+96,World Capitals,"What is Vienna, Austria?","Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?"
+97,Language & Its Meanings,What is a night owl?,"Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?"
+98,Flags of Our Hemisphere,What is Brazil?,"The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?"
+99,Names in U.S. History,Who is Oliver Brown?,What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
+100,Children’s Authors,"Who is Sarah? (from Sarah, Plain and Tall)","Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?"
+,,,
+TOTALS,,,
--- a/examples/jeopardy/questions.txt
+++ b/examples/jeopardy/questions.txt
@@ -0,0 +1,100 @@
+Which man born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
+What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?
+Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?
+James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?
+England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
+Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?
+In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?
+Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?
+Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?
+What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
+A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?
+A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?
+Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?
+The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?
+In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?
+What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
+Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?
+What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?
+In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?
+At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?
+Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
+A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?
+In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
+Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?
+A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
+Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?
+After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?
+The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?
+This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?
+An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?
+Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?
+What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?
+A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later?
+Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?
+Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?
+The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?
+For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?
+Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory’s largest ethnic group?
+In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?
+In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
+What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?
+In 2010 who introduced the 4-point shot, 35 feet from the basket?
+Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
+A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?
+In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?
+Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?
+In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?
+Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?
+This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
+1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
+Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?
+Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object?
+Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?
+5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other?
+Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?
+The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?
+Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?
+Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
+In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
+At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?
+Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?
+Like Sir Thomas More, 3 16th century English queens are buried at what British location?
+In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person be condemned'?
+The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?
+What was first sold in 1908, at a price equivalent to about $27,000 today?
+The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
+The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
+In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
+In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?
+Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?
+After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?
+Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
+Until a 1903 secession, what country's contiguous territory spanned 2 continents?
+Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?
+Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?
+Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?
+Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?
+Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?
+Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?
+In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?
+In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?
+In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?
+The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
+The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
+Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?
+What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?
+What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?
+Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
+Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?
+Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?
+The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
+A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?
+Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?
+A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
+In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?
+Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?
+Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?
+The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?
+What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
+Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?
--- a/examples/llm.vim
+++ b/examples/llm.vim
@@ -0,0 +1,28 @@
+" Basic plugin example
+
+function! Llm()
+
+  let url = "http://127.0.0.1:8080/completion"
+
+  " Get the content of the current buffer
+  let buffer_content = join(getline(1, '$'), "\n")
+
+  " Create the JSON payload
+  let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":256,"stop": ["\n\n\n"],"stream": v:false}
+  let json_payload.prompt = buffer_content
+
+  " Define the curl command
+  let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url
+  let response = system(curl_command, json_encode(json_payload))
+
+  " Extract the content field from the response
+  let content = json_decode(response).content
+
+  let split_newlines = split(content, '\n', 1)
+
+  " Insert the content at the cursor position
+  call setline(line('.'), [ getline('.') . split_newlines[0] ] + split_newlines[1:])
+endfunction
+
+command! Llm call Llm()
+noremap <F2> :Llm<CR>
--- a/examples/model-conversion/Makefile
+++ b/examples/model-conversion/Makefile
@@ -116,38 +116,15 @@ embedding-convert-model:
 	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
 	./scripts/embedding/convert-model.sh

-embedding-convert-model-st:
-	$(call validate_embedding_model_path,embedding-convert-model-st)
-	@MODEL_NAME="$(MODEL_NAME)" OUTTYPE="$(OUTTYPE)" MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
-	METADATA_OVERRIDE="$(METADATA_OVERRIDE)" \
-	./scripts/embedding/convert-model.sh -st
-
 embedding-run-original-model:
 	$(call validate_embedding_model_path,embedding-run-original-model)
-	@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" \
-	USE_SENTENCE_TRANSFORMERS="$(USE_SENTENCE_TRANSFORMERS)" \
-	./scripts/embedding/run-original-model.py \
-	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
-	$(if $(USE_SENTENCE_TRANSFORMERS),--use-sentence-transformers)
-
-embedding-run-original-model-st: USE_SENTENCE_TRANSFORMERS=1
-embedding-run-original-model-st: embedding-run-original-model
+	@EMBEDDING_MODEL_PATH="$(EMBEDDING_MODEL_PATH)" ./scripts/embedding/run-original-model.py

 embedding-run-converted-model:
-	@./scripts/embedding/run-converted-model.sh $(CONVERTED_EMBEDDING_MODEL) \
-	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)") \
-	$(if $(USE_POOLING),--pooling)
-
-embedding-run-converted-model-st: USE_POOLING=1
-embedding-run-converted-model-st: embedding-run-converted-model
+	@CONVERTED_EMBEDDING_MODEL="$(CONVERTED_EMBEDDING_MODEL)" ./scripts/embedding/run-converted-model.sh ${CONVERTED_EMBEDDING_MODEL}

 embedding-verify-logits: embedding-run-original-model embedding-run-converted-model
-	@./scripts/embedding/compare-embeddings-logits.sh \
-	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
-
-embedding-verify-logits-st: embedding-run-original-model-st embedding-run-converted-model-st
-	@./scripts/embedding/compare-embeddings-logits.sh \
-	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
+	@./scripts/embedding/compare-embeddings-logits.sh

 embedding-inspect-original-model:
 	$(call validate_embedding_model_path,embedding-inspect-original-model)
@@ -179,8 +156,7 @@ embedding-quantize-model:
 	$(call quantize_model,$(CONVERTED_EMBEDDING_MODEL),QUANTIZED_EMBEDDING_MODEL)

 embedding-run-quantized-model:
-	@./scripts/embedding/run-converted-model.sh $(QUANTIZED_EMBEDDING_MODEL) \
-	$(if $(PROMPTS_FILE),--prompts-file "$(PROMPTS_FILE)")
+	@./scripts/embedding/run-converted-model.sh ${QUANTIZED_EMBEDDING_MODEL}

 ###
 ### Perplexity targets/recipes
--- a/examples/model-conversion/README.md
+++ b/examples/model-conversion/README.md
@@ -105,12 +105,12 @@ new model, the model can be converted to GGUF format using the following command
 ### Inspecting the converted model
 The converted model can be inspected using the following command:
 ```console
-(venv) $ make causal-inspect-converted-model
+(venv) $ make inspect-converted-model
 ```

 ### Running the converted model
 ```console
-(venv) $ make causal-run-converted-model
+(venv) $ make run-converted-model
 ```

 ### Model logits verfication
@@ -189,23 +189,6 @@ This command will save two files to the `data` directory, one is a binary
 file containing logits which will be used for comparison with the converted
 model, and the other is a text file which allows for manual visual inspection.

-#### Using SentenceTransformer with numbered layers
-For models that have numbered SentenceTransformer layers (01_Pooling, 02_Dense,
-03_Dense, 04_Normalize), use the `-st` targets to apply all these layers:
-
-```console
-# Run original model with SentenceTransformer (applies all numbered layers)
-(venv) $ make embedding-run-original-model-st
-
-# Run converted model with pooling enabled
-(venv) $ make embedding-run-converted-model-st
-```
-
-This will use the SentenceTransformer library to load and run the model, which
-automatically applies all the numbered layers in the correct order. This is
-particularly useful when comparing with models that should include these
-additional transformation layers beyond just the base model output.
-
 ### Model conversion
 After updates have been made to [gguf-py](../../gguf-py) to add support for the
 new model the model can be converted to GGUF format using the following command:
@@ -225,13 +208,6 @@ was done manually in the previous steps) and compare the logits:
 (venv) $ make embedding-verify-logits
 ```

-For models with SentenceTransformer layers, use the `-st` verification target:
-```console
-(venv) $ make embedding-verify-logits-st
-```
-This convenience target automatically runs both the original model with SentenceTransformer
-and the converted model with pooling enabled, then compares the results.
-
 ### llama-server verification
 To verify that the converted model works with llama-server, the following
 command can be used:
--- a/examples/model-conversion/logits.cpp
+++ b/examples/model-conversion/logits.cpp
@@ -1,7 +1,4 @@
 #include "llama.h"
-#include "common.h"
-
-
 #include <cstdio>
 #include <cstring>
 #include <string>
@@ -11,10 +8,7 @@

 static void print_usage(int, char ** argv) {
    printf("\nexample usage:\n");
-    printf("\n    %s -m model.gguf [-ngl n_gpu_layers] -embd-mode [-pooling] [-embd-norm <norm>] [prompt]\n", argv[0]);
-    printf("\n");
-    printf("  -embd-norm: normalization type for pooled embeddings (default: 2)\n");
-    printf("              -1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm\n");
+    printf("\n    %s -m model.gguf [-ngl n_gpu_layers] -embd-mode [prompt]\n", argv[0]);
    printf("\n");
 }

@@ -23,8 +17,6 @@ int main(int argc, char ** argv) {
    std::string prompt = "Hello, my name is";
    int ngl = 0;
    bool embedding_mode = false;
-    bool pooling_enabled = false;
-    int32_t embd_norm = 2;  // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)

    {
        int i = 1;
@@ -49,13 +41,9 @@ int main(int argc, char ** argv) {
                    return 1;
                }
            } else if (strcmp(argv[i], "-embd-mode") == 0) {
-                embedding_mode = true;
-            } else if (strcmp(argv[i], "-pooling") == 0) {
-                pooling_enabled = true;
-            } else if (strcmp(argv[i], "-embd-norm") == 0) {
                if (i + 1 < argc) {
                    try {
-                        embd_norm = std::stoi(argv[++i]);
+                        embedding_mode = true;
                    } catch (...) {
                        print_usage(argc, argv);
                        return 1;
@@ -124,7 +112,7 @@ int main(int argc, char ** argv) {
    ctx_params.no_perf = false;
    if (embedding_mode) {
        ctx_params.embeddings = true;
-        ctx_params.pooling_type = pooling_enabled ? LLAMA_POOLING_TYPE_MEAN : LLAMA_POOLING_TYPE_NONE;
+        ctx_params.pooling_type = LLAMA_POOLING_TYPE_NONE;
        ctx_params.n_ubatch = ctx_params.n_batch;
    }

@@ -155,80 +143,35 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    float * data_ptr;
-    int data_size;
+    float * logits;
+    int n_logits;
    const char * type;
-    std::vector<float> embd_out;

    if (embedding_mode) {
-        const int n_embd = llama_model_n_embd(model);
-        const int n_embd_count = pooling_enabled ? 1 : batch.n_tokens;
-        const int n_embeddings = n_embd * n_embd_count;
-        float * embeddings;
+        logits = llama_get_embeddings(ctx);
+        n_logits = llama_model_n_embd(model) * batch.n_tokens;
        type = "-embeddings";
-
-        if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) {
-            embeddings = llama_get_embeddings_seq(ctx, 0);
-            embd_out.resize(n_embeddings);
-            printf("Normalizing embeddings using norm: %d\n", embd_norm);
-            common_embd_normalize(embeddings, embd_out.data(), n_embeddings, embd_norm);
-            embeddings = embd_out.data();
-        } else {
-            embeddings = llama_get_embeddings(ctx);
-        }
-
-        printf("Embedding dimension: %d\n", n_embd);
-        printf("\n");
-
-        // Print embeddings in the specified format
-        for (int j = 0; j < n_embd_count; j++) {
-            printf("embedding %d: ", j);
-
-            // Print first 3 values
-            for (int i = 0; i < 3 && i < n_embd; i++) {
-                printf("%9.6f ", embeddings[j * n_embd + i]);
-            }
-
-            printf(" ... ");
-
-            // Print last 3 values
-            for (int i = n_embd - 3; i < n_embd; i++) {
-                if (i >= 0) {
-                    printf("%9.6f ", embeddings[j * n_embd + i]);
-                }
-            }
-
-            printf("\n");
-        }
-        printf("\n");
-
-        printf("Embeddings size: %d\n", n_embeddings);
-
-        data_ptr = embeddings;
-        data_size = n_embeddings;
+        printf("Embeddings size: %d\n", n_logits);
    } else {
-        float * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
-        const int n_logits = llama_vocab_n_tokens(vocab);
+        logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+        n_logits = llama_vocab_n_tokens(vocab);
        type = "";
        printf("Vocab size: %d\n", n_logits);
-
-        data_ptr = logits;
-        data_size = n_logits;
    }

    std::filesystem::create_directory("data");

-    // Save data to binary file
+    // Save logits to binary file
    char bin_filename[512];
    snprintf(bin_filename, sizeof(bin_filename), "data/llamacpp-%s%s.bin", model_name, type);
-    printf("Saving data to %s\n", bin_filename);
+    printf("Saving logits to %s\n", bin_filename);

    FILE * f = fopen(bin_filename, "wb");
    if (f == NULL) {
        fprintf(stderr, "%s: error: failed to open binary output file\n", __func__);
        return 1;
    }
-    fwrite(data_ptr, sizeof(float), data_size, f);
+    fwrite(logits, sizeof(float), n_logits, f);
    fclose(f);

    // Also save as text for debugging
@@ -239,27 +182,26 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s: error: failed to open text output file\n", __func__);
        return 1;
    }
-    for (int i = 0; i < data_size; i++) {
-        fprintf(f, "%d: %.6f\n", i, data_ptr[i]);
+    for (int i = 0; i < n_logits; i++) {
+        fprintf(f, "%d: %.6f\n", i, logits[i]);  // Added index and changed format
    }
    fclose(f);

-    if (!embedding_mode) {
-        printf("First 10 logits: ");
-        for (int i = 0; i < 10 && i < data_size; i++) {
-            printf("%.6f ", data_ptr[i]);
-        }
-        printf("\n");
-
-        printf("Last 10 logits: ");
-        for (int i = data_size - 10; i < data_size; i++) {
-            if (i >= 0) printf("%.6f ", data_ptr[i]);
-        }
-        printf("\n\n");
+    // Print first and last 10 logits for quick verification
+    printf("First 10 logits: ");
+    for (int i = 0; i < 10 && i < n_logits; i++) {
+        printf("%.6f ", logits[i]);
    }
+    printf("\n");

-    printf("Data saved to %s\n", bin_filename);
-    printf("Data saved to %s\n", txt_filename);
+    printf("Last 10 logits: ");
+    for (int i = n_logits - 10; i < n_logits; i++) {
+        if (i >= 0) printf("%.6f ", logits[i]);
+    }
+    printf("\n\n");
+
+    printf("Logits saved to %s\n", bin_filename);
+    printf("Logits saved to %s\n", txt_filename);

    llama_free(ctx);
    llama_model_free(model);
--- a/examples/model-conversion/requirements.txt
+++ b/examples/model-conversion/requirements.txt
@@ -4,4 +4,3 @@ torchvision
 transformers
 huggingface-hub
 accelerate
-sentence-transformers
--- a/examples/model-conversion/scripts/causal/compare-logits.py
+++ b/examples/model-conversion/scripts/causal/compare-logits.py
@@ -48,7 +48,7 @@ def main():
        print(f"Error: Model file not found: {model_path}")
        sys.exit(1)

-    model_name = os.path.basename(model_path)
+    model_name = os.path.splitext(os.path.basename(model_path))[0]
    data_dir = Path("data")

    pytorch_file = data_dir / f"pytorch-{model_name}.bin"
--- a/examples/model-conversion/scripts/causal/run-org-model.py
+++ b/examples/model-conversion/scripts/causal/run-org-model.py
@@ -193,7 +193,7 @@ print(f"Input text: {repr(prompt)}")
 print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")

 with torch.no_grad():
-    outputs = model(input_ids.to(model.device))
+    outputs = model(input_ids)
    logits = outputs.logits

    # Extract logits for the last token (next token prediction)
--- a/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh
+++ b/examples/model-conversion/scripts/embedding/compare-embeddings-logits.sh
@@ -2,37 +2,8 @@

 set -e

-# Parse command line arguments
-MODEL_PATH=""
-MODEL_NAME=""
-PROMPTS_FILE=""
-
-# First argument is always model path
-if [ $# -gt 0 ] && [[ "$1" != --* ]]; then
-    MODEL_PATH="$1"
-    shift
-fi
-
-# Parse remaining arguments
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        --prompts-file|-pf)
-            PROMPTS_FILE="$2"
-            shift 2
-            ;;
-        *)
-            # If MODEL_NAME not set and this isn't a flag, use as model name
-            if [ -z "$MODEL_NAME" ] && [[ "$1" != --* ]]; then
-                MODEL_NAME="$1"
-            fi
-            shift
-            ;;
-    esac
-done
-
-# Set defaults
-MODEL_PATH="${MODEL_PATH:-"$EMBEDDING_MODEL_PATH"}"
-MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}"
+MODEL_PATH="${1:-"$EMBEDDING_MODEL_PATH"}"
+MODEL_NAME="${2:-$(basename "$MODEL_PATH")}"

 if [ -t 0 ]; then
    CPP_EMBEDDINGS="data/llamacpp-${MODEL_NAME}-embeddings.bin"
@@ -64,18 +35,8 @@ with open('$TEMP_FILE', 'wb') as f:
    trap "rm -f $TEMP_FILE" EXIT
 fi

-# Build the semantic_check.py command
-SEMANTIC_CMD="python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
+python scripts/utils/semantic_check.py --model-path $MODEL_PATH \
    --python-embeddings data/pytorch-${MODEL_NAME}-embeddings.bin \
-    --cpp-embeddings $CPP_EMBEDDINGS"
-
-# Add prompts file if specified, otherwise use default prompt
-if [ -n "$PROMPTS_FILE" ]; then
-    SEMANTIC_CMD="$SEMANTIC_CMD --prompts-file \"$PROMPTS_FILE\""
-else
-    SEMANTIC_CMD="$SEMANTIC_CMD --prompt \"Hello world today\""
-fi
-
-# Execute the command
-eval $SEMANTIC_CMD
+    --cpp-embeddings $CPP_EMBEDDINGS \
+    --prompt "Hello world today"

--- a/examples/model-conversion/scripts/embedding/convert-model.sh
+++ b/examples/model-conversion/scripts/embedding/convert-model.sh
@@ -2,21 +2,6 @@

 set -e

-# Parse command line arguments
-SENTENCE_TRANSFORMERS=""
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        -st|--sentence-transformers)
-            SENTENCE_TRANSFORMERS="--sentence-transformers-dense-modules"
-            shift
-            ;;
-        *)
-            echo "Unknown option: $1"
-            exit 1
-            ;;
-    esac
-done
-
 MODEL_NAME="${MODEL_NAME:-$(basename "$EMBEDDING_MODEL_PATH")}"
 OUTPUT_DIR="${OUTPUT_DIR:-../../models}"
 TYPE="${OUTTYPE:-f16}"
@@ -30,8 +15,7 @@ echo "Converted model path:: ${CONVERTED_MODEL}"
 python ../../convert_hf_to_gguf.py --verbose \
    ${EMBEDDING_MODEL_PATH} \
    --outfile ${CONVERTED_MODEL} \
-    --outtype ${TYPE} \
-    ${SENTENCE_TRANSFORMERS}
+    --outtype ${TYPE}

 echo ""
 echo "The environment variable CONVERTED_EMBEDDING MODEL can be set to this path using:"
--- a/examples/model-conversion/scripts/embedding/run-converted-model.sh
+++ b/examples/model-conversion/scripts/embedding/run-converted-model.sh
@@ -2,32 +2,8 @@

 set -e

-# Parse command line arguments
-CONVERTED_MODEL=""
-PROMPTS_FILE=""
-USE_POOLING=""
-
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        -p|--prompts-file)
-            PROMPTS_FILE="$2"
-            shift 2
-            ;;
-        --pooling)
-            USE_POOLING="1"
-            shift
-            ;;
-        *)
-            if [ -z "$CONVERTED_MODEL" ]; then
-                CONVERTED_MODEL="$1"
-            fi
-            shift
-            ;;
-    esac
-done
-
-# First try command line argument, then environment variable
-CONVERTED_MODEL="${CONVERTED_MODEL:-"$CONVERTED_EMBEDDING_MODEL"}"
+# First try command line argument, then environment variable, then file
+CONVERTED_MODEL="${1:-"$CONVERTED_EMBEDDING_MODEL"}"

 # Final check if we have a model path
 if [ -z "$CONVERTED_MODEL" ]; then
@@ -37,23 +13,8 @@ if [ -z "$CONVERTED_MODEL" ]; then
    exit 1
 fi

-# Read prompt from file or use default
-if [ -n "$PROMPTS_FILE" ]; then
-    if [ ! -f "$PROMPTS_FILE" ]; then
-        echo "Error: Prompts file '$PROMPTS_FILE' not found" >&2
-        exit 1
-    fi
-    PROMPT=$(cat "$PROMPTS_FILE")
-else
-    PROMPT="Hello world today"
-fi
-
 echo $CONVERTED_MODEL

 cmake --build ../../build --target llama-logits -j8
-# TODO: update logits.cpp to accept a --file/-f option for the prompt
-if [ -n "$USE_POOLING" ]; then
-    ../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode -pooling "$PROMPT"
-else
-    ../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "$PROMPT"
-fi
+
+../../build/bin/llama-logits -m "$CONVERTED_MODEL" -embd-mode "Hello world today"
--- a/examples/model-conversion/scripts/embedding/run-original-model.py
+++ b/examples/model-conversion/scripts/embedding/run-original-model.py
@@ -13,131 +13,64 @@ unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')

 parser = argparse.ArgumentParser(description='Process model with specified path')
 parser.add_argument('--model-path', '-m', help='Path to the model')
-parser.add_argument('--prompts-file', '-p', help='Path to file containing prompts (one per line)')
-parser.add_argument('--use-sentence-transformers', action='store_true',
-                    help='Use SentenceTransformer to apply all numbered layers (01_Pooling, 02_Dense, 03_Dense, 04_Normalize)')
 args = parser.parse_args()

-def read_prompt_from_file(file_path):
-    try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            return f.read().strip()
-    except FileNotFoundError:
-        print(f"Error: Prompts file '{file_path}' not found")
-        exit(1)
-    except Exception as e:
-        print(f"Error reading prompts file: {e}")
-        exit(1)
-
 model_path = os.environ.get('EMBEDDING_MODEL_PATH', args.model_path)
 if model_path is None:
    parser.error("Model path must be specified either via --model-path argument or EMBEDDING_MODEL_PATH environment variable")

-# Determine if we should use SentenceTransformer
-use_sentence_transformers = args.use_sentence_transformers or os.environ.get('USE_SENTENCE_TRANSFORMERS', '').lower() in ('1', 'true', 'yes')
+tokenizer = AutoTokenizer.from_pretrained(model_path)

-if use_sentence_transformers:
-    from sentence_transformers import SentenceTransformer
-    print("Using SentenceTransformer to apply all numbered layers")
-    model = SentenceTransformer(model_path)
-    tokenizer = model.tokenizer
-    config = model[0].auto_model.config  # type: ignore
+if unreleased_model_name:
+    model_name_lower = unreleased_model_name.lower()
+    unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
+    class_name = f"{unreleased_model_name}Model"
+    print(f"Importing unreleased model module: {unreleased_module_path}")
+
+    try:
+        model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
+        model = model_class.from_pretrained(model_path)  # Note: from_pretrained, not fromPretrained
+    except (ImportError, AttributeError) as e:
+        print(f"Failed to import or load model: {e}")
+        exit(1)
 else:
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-    config = AutoConfig.from_pretrained(model_path)
-
-    # This can be used to override the sliding window size for manual testing. This
-    # can be useful to verify the sliding window attention mask in the original model
-    # and compare it with the converted .gguf model.
-    if hasattr(config, 'sliding_window'):
-        original_sliding_window = config.sliding_window
-        #original_sliding_window = 6
-        print(f"Modified sliding window: {original_sliding_window} -> {config.sliding_window}")
-
-    print(f"Using unreleased model: {unreleased_model_name}")
-    if unreleased_model_name:
-        model_name_lower = unreleased_model_name.lower()
-        unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
-        class_name = f"{unreleased_model_name}Model"
-        print(f"Importing unreleased model module: {unreleased_module_path}")
-
-        try:
-            model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
-            model = model_class.from_pretrained(model_path, config=config)
-        except (ImportError, AttributeError) as e:
-            print(f"Failed to import or load model: {e}")
-            exit(1)
-    else:
-        model = AutoModel.from_pretrained(model_path, config=config)
-    print(f"Model class: {type(model)}")
-    print(f"Model file: {type(model).__module__}")
-
-# Verify the model is using the correct sliding window
-if not use_sentence_transformers:
-    if hasattr(model.config, 'sliding_window'):  # type: ignore
-        print(f"Model's sliding_window: {model.config.sliding_window}")  # type: ignore
-    else:
-        print("Model config does not have sliding_window attribute")
+    model = AutoModel.from_pretrained(model_path)
+print(f"Model class: {type(model)}")
+#print(f"Model file: {type(model).__module__}")
+config = AutoConfig.from_pretrained(model_path)

 model_name = os.path.basename(model_path)

-if args.prompts_file:
-    prompt_text = read_prompt_from_file(args.prompts_file)
-    texts = [prompt_text]
-else:
-    texts = ["Hello world today"]
+texts = [ "Hello world today" ]
+
+encoded = tokenizer(
+    texts,
+    padding=True,
+    truncation=True,
+    return_tensors="pt"
+)
+
+tokens = encoded['input_ids'][0]
+token_strings = tokenizer.convert_ids_to_tokens(tokens)
+for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
+    print(f"{token_id:6d} -> '{token_str}'")

 with torch.no_grad():
-    if use_sentence_transformers:
-        embeddings = model.encode(texts, convert_to_numpy=True)
-        all_embeddings = embeddings  # Shape: [batch_size, hidden_size]
+    outputs = model(**encoded)
+    hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_size]

-        encoded = tokenizer(
-            texts,
-            padding=True,
-            truncation=True,
-            return_tensors="pt"
-        )
-        tokens = encoded['input_ids'][0]
-        token_strings = tokenizer.convert_ids_to_tokens(tokens)
-        for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
-            print(f"{token_id:6d} -> '{token_str}'")
+    # Extract embeddings for each token (matching LLAMA_POOLING_TYPE_NONE behavior)
+    all_embeddings = hidden_states[0].cpu().numpy()  # Shape: [seq_len, hidden_size]

-        print(f"Embeddings shape (after all SentenceTransformer layers): {all_embeddings.shape}")
-        print(f"Embedding dimension: {all_embeddings.shape[1] if len(all_embeddings.shape) > 1 else all_embeddings.shape[0]}")  # type: ignore
-    else:
-        # Standard approach: use base model output only
-        encoded = tokenizer(
-            texts,
-            padding=True,
-            truncation=True,
-            return_tensors="pt"
-        )
+    print(f"Hidden states shape: {hidden_states.shape}")
+    print(f"All embeddings shape: {all_embeddings.shape}")
+    print(f"Embedding dimension: {all_embeddings.shape[1]}")

-        tokens = encoded['input_ids'][0]
-        token_strings = tokenizer.convert_ids_to_tokens(tokens)
-        for i, (token_id, token_str) in enumerate(zip(tokens, token_strings)):
-            print(f"{token_id:6d} -> '{token_str}'")
+    # Print embeddings exactly like embedding.cpp does for LLAMA_POOLING_TYPE_NONE
+    n_embd = all_embeddings.shape[1]
+    n_embd_count = all_embeddings.shape[0]

-        outputs = model(**encoded)
-        hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_size]
-
-        all_embeddings = hidden_states[0].cpu().numpy()  # Shape: [seq_len, hidden_size]
-
-        print(f"Hidden states shape: {hidden_states.shape}")
-        print(f"All embeddings shape: {all_embeddings.shape}")
-        print(f"Embedding dimension: {all_embeddings.shape[1]}")
-
-    if len(all_embeddings.shape) == 1:
-        n_embd = all_embeddings.shape[0]  # type: ignore
-        n_embd_count = 1
-        all_embeddings = all_embeddings.reshape(1, -1)
-    else:
-        n_embd = all_embeddings.shape[1]  # type: ignore
-        n_embd_count = all_embeddings.shape[0]  # type: ignore
-
-    print()
+    print()  # Empty line to match C++ output

    for j in range(n_embd_count):
        embedding = all_embeddings[j]
@@ -155,23 +88,29 @@ with torch.no_grad():

        print()  # New line

-    print()
+    print()  # Final empty line to match C++ output

    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)
    bin_filename = data_dir / f"pytorch-{model_name}-embeddings.bin"
    txt_filename = data_dir / f"pytorch-{model_name}-embeddings.txt"

+    # Save all embeddings flattened (matching what embedding.cpp would save if it did)
    flattened_embeddings = all_embeddings.flatten()
    flattened_embeddings.astype(np.float32).tofile(bin_filename)

    with open(txt_filename, "w") as f:
-        idx = 0
+        f.write(f"# Model class: {model_name}\n")
+        f.write(f"# Tokens: {token_strings}\n")
+        f.write(f"# Shape: {all_embeddings.shape}\n")
+        f.write(f"# n_embd_count: {n_embd_count}, n_embd: {n_embd}\n\n")
+
        for j in range(n_embd_count):
-            for value in all_embeddings[j]:
-                f.write(f"{idx}: {value:.6f}\n")
-                idx += 1
-    print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} embeddings × {n_embd} dimensions)")
+            f.write(f"# Token {j} ({token_strings[j]}):\n")
+            for i, value in enumerate(all_embeddings[j]):
+                f.write(f"{j}_{i}: {value:.6f}\n")
+            f.write("\n")
+    print(f"Total values: {len(flattened_embeddings)} ({n_embd_count} tokens × {n_embd} dimensions)")
    print("")
    print(f"Saved bin embeddings to: {bin_filename}")
    print(f"Saved txt embeddings to: {txt_filename}")
--- a/examples/model-conversion/scripts/utils/check-nmse.py
+++ b/examples/model-conversion/scripts/utils/check-nmse.py
@@ -67,7 +67,7 @@ def main():
    parser.add_argument('-m', '--model-path', required=True,  help='Path to the model directory')
    args = parser.parse_args()

-    model_name = os.path.basename(args.model_path)
+    model_name = os.path.splitext(os.path.basename(args.model_path))[0]
    data_dir = Path("data")

    pytorch_file = data_dir / f"pytorch-{model_name}.bin"
--- a/examples/model-conversion/scripts/utils/inspect-org-model.py
+++ b/examples/model-conversion/scripts/utils/inspect-org-model.py
@@ -40,7 +40,7 @@ if os.path.exists(index_path):
        file_path = os.path.join(model_path, file_name)
        print(f"\n--- From {file_name} ---")

-        with safe_open(file_path, framework="pt") as f:
+        with safe_open(file_path, framework="pt") as f:  # type: ignore
            for tensor_name in sorted(tensor_names):
                tensor = f.get_tensor(tensor_name)
                print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
@@ -49,7 +49,7 @@ elif os.path.exists(single_file_path):
    # Single file model (original behavior)
    print("Single-file model detected")

-    with safe_open(single_file_path, framework="pt") as f:
+    with safe_open(single_file_path, framework="pt") as f:  # type: ignore
        keys = f.keys()
        print("Tensors in model:")
        for key in sorted(keys):
--- a/examples/model-conversion/scripts/utils/semantic_check.py
+++ b/examples/model-conversion/scripts/utils/semantic_check.py
@@ -35,11 +35,7 @@ def cosine_similarity(a, b=None):

 def load_embeddings_from_file(filename, n_tokens, n_embd):
    embeddings = np.fromfile(filename, dtype=np.float32)
-    # Check if this is pooled (single embedding) or per-token embeddings
-    if len(embeddings) == n_embd:
-        return embeddings.reshape(1, n_embd)
-    else:
-        return embeddings.reshape(n_tokens, n_embd)
+    return embeddings.reshape(n_tokens, n_embd)

 def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
    np.set_printoptions(suppress=True, precision=6)
@@ -52,94 +48,58 @@ def test_single_prompt_similarity(python_emb, cpp_emb, tokens, prompt):
    print(f"Embeddings shape: Python {python_emb.shape}, llama.cpp {cpp_emb.shape}")

    n_tokens = len(tokens)
-    is_pooled = python_emb.shape[0] == 1

-    if is_pooled:
-        print(f"\n[Pooled Embeddings Mode - comparing single sentence embeddings]")
-
-        # 1. Direct embedding comparison for pooled embeddings
-        print(f"\n1. Raw Embedding Magnitude Comparison:")
-        py_mag = np.linalg.norm(python_emb[0])
-        cpp_mag = np.linalg.norm(cpp_emb[0])
+    # 1. Direct embedding comparison
+    print(f"\n1. Raw Embedding Magnitude Comparison:")
+    # Check if the distance of each token embedding from the origin and compare
+    # if the vectors are on the same "sphere". This does not tell us about
+    # direction (meaning of the token embedding), just magnitude.
+    for i in range(n_tokens):
+        py_mag = np.linalg.norm(python_emb[i]) # calculate standard euclidean norm for Python embeddings
+        cpp_mag = np.linalg.norm(cpp_emb[i])   # calculate standard euclidean norm for llama.cpp embeddings
        ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
-        print(f"   Pooled embedding: Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
+        print(f"   Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")

-        # 2. Cross-model similarity for pooled embeddings
-        print(f"\n2. Cross-Model Pooled Embedding Similarity:")
-        sim = cosine_similarity([python_emb[0]], [cpp_emb[0]])[0][0]
-        print(f"   Cosine similarity: {sim:.6f}")
+    # 2. Cosine similarity between tokens within each model
+    # Here we check the direction of token embeddings to see if the have the
+    # same meaning (similarity). This is done by calculating cosine similarity
+    # of a pair of token embeddings within each model.
+    print(f"\n2. Within-Model Token Similarities:")
+    print("   Python model:")
+    for i in range(n_tokens):
+        for j in range(i+1, n_tokens):
+            sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0]
+            print(f"     {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")

-        return {
-            'cross_model_similarities': [sim],
-            'similarity_matrix_diff': np.array([[0.0]]),
-            'max_diff': 0.0,
-            'mean_diff': 0.0,
-            'rms_diff': 0.0
-        }
-    else:
-        # Original per-token comparison logic
-        # 1. Direct embedding comparison
-        print(f"\n1. Raw Embedding Magnitude Comparison:")
-        # Check if the distance of each token embedding from the origin and compare
-        # if the vectors are on the same "sphere". This does not tell us about
-        # direction (meaning of the token embedding), just magnitude.
-        for i in range(n_tokens):
-            py_mag = np.linalg.norm(python_emb[i]) # calculate standard euclidean norm for Python embeddings
-            cpp_mag = np.linalg.norm(cpp_emb[i])   # calculate standard euclidean norm for llama.cpp embeddings
-            ratio = py_mag / cpp_mag if cpp_mag > 0 else float('inf')
-            print(f"   Token {i} ({tokens[i]}): Python={py_mag:.3f}, llama.cpp={cpp_mag:.3f}, ratio={ratio:.3f}")
+    print("   llama.cpp model:")
+    for i in range(n_tokens):
+        for j in range(i+1, n_tokens):
+            sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0]
+            print(f"     {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")

-        # 2. Cosine similarity between tokens within each model
-        # Here we check the direction of token embeddings to see if the have the
-        # same meaning (similarity). This is done by calculating cosine similarity
-        # of a pair of token embeddings within each model.
-        print(f"\n2. Within-Model Token Similarities:")
-        print("   Python model:")
-        for i in range(n_tokens):
-            for j in range(i+1, n_tokens):
-                sim = cosine_similarity([python_emb[i]], [python_emb[j]])[0][0]
-                print(f"     {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")
+    # 3. Cross-model similarity (same token position)
+    print(f"\n3. Cross-Model Same-Token Similarities:")
+    for i in range(n_tokens):
+        sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0]
+        print(f"   Token {i} ({tokens[i]}): {sim:.4f}")

-        print("   llama.cpp model:")
-        for i in range(n_tokens):
-            for j in range(i+1, n_tokens):
-                sim = cosine_similarity([cpp_emb[i]], [cpp_emb[j]])[0][0]
-                print(f"     {tokens[i]} ↔ {tokens[j]}: {sim:.4f}")
+    # 4. Similarity matrix comparison
+    print(f"\n4. Similarity Matrix Differences:")
+    py_sim_matrix = cosine_similarity(python_emb)
+    cpp_sim_matrix = cosine_similarity(cpp_emb)
+    diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix)

-        # 3. Cross-model similarity (same token position)
-        print(f"\n3. Cross-Model Same-Token Similarities:")
-        for i in range(n_tokens):
-            sim = cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0]
-            print(f"   Token {i} ({tokens[i]}): {sim:.4f}")
+    print(f"   Max difference: {np.max(diff_matrix):.4f}")
+    print(f"   Mean difference: {np.mean(diff_matrix):.4f}")
+    print(f"   RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}")

-        # 4. Similarity matrix comparison
-        print(f"\n4. Similarity Matrix Differences:")
-        py_sim_matrix = cosine_similarity(python_emb)
-        cpp_sim_matrix = cosine_similarity(cpp_emb)
-        diff_matrix = np.abs(py_sim_matrix - cpp_sim_matrix)
-
-        print(f"   Max difference: {np.max(diff_matrix):.4f}")
-        print(f"   Mean difference: {np.mean(diff_matrix):.4f}")
-        print(f"   RMS difference: {np.sqrt(np.mean(diff_matrix**2)):.4f}")
-
-        return {
-            'cross_model_similarities': [cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] for i in range(n_tokens)],
-            'similarity_matrix_diff': diff_matrix,
-            'max_diff': np.max(diff_matrix),
-            'mean_diff': np.mean(diff_matrix),
-            'rms_diff': np.sqrt(np.mean(diff_matrix**2))
-        }
-
-def read_prompt_from_file(file_path):
-    try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            return f.read().strip()
-    except FileNotFoundError:
-        print(f"Error: Prompts file '{file_path}' not found")
-        exit(1)
-    except Exception as e:
-        print(f"Error reading prompts file: {e}")
-        exit(1)
+    return {
+        'cross_model_similarities': [cosine_similarity([python_emb[i]], [cpp_emb[i]])[0][0] for i in range(n_tokens)],
+        'similarity_matrix_diff': diff_matrix,
+        'max_diff': np.max(diff_matrix),
+        'mean_diff': np.mean(diff_matrix),
+        'rms_diff': np.sqrt(np.mean(diff_matrix**2))
+    }

 def main():
    parser = argparse.ArgumentParser(description='Test semantic similarity between Python and llama.cpp embeddings')
@@ -148,20 +108,14 @@ def main():
    parser.add_argument('--cpp-embeddings', '-ce', help='Path to llama.cpp embeddings "logits" binary file')
    parser.add_argument('--causal', '-c', default=False, help='if the model is causal (default: false)', action='store_true')
    parser.add_argument('--prompt', '-p', default='Hello world today', help='Test prompt')
-    parser.add_argument('--prompts-file', '-pf', help='Path to file containing prompts')

    args = parser.parse_args()

-    if args.prompts_file:
-        prompt = read_prompt_from_file(args.prompts_file)
-    else:
-        prompt = args.prompt
-
    print("Semantic Similarity Test Between Python and llama.cpp Embedding Models")
    print("=" * 70)

    # Single prompt detailed comparison
-    print(f"\nTesting with prompt: '{prompt}'")
+    print(f"\nTesting with prompt: '{args.prompt}'")

    # Load the python model to get configuration information and also to load the tokenizer.
    print("Loading model and tokenizer using AutoTokenizer:", args.model_path)
@@ -190,7 +144,7 @@ def main():
        else:
            model = AutoModel.from_pretrained(args.model_path)

-    encoded = tokenizer(prompt, return_tensors="pt")
+    encoded = tokenizer(args.prompt, return_tensors="pt")
    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    n_tokens = len(tokens)
    print(f"n_tokens: {n_tokens}");
@@ -201,7 +155,7 @@ def main():
    python_embeddings = load_embeddings_from_file(args.python_embeddings, n_tokens, model.config.hidden_size)

    # Run comparison
-    results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, prompt)
+    results = test_single_prompt_similarity(python_embeddings, llamacpp_embeddings, tokens, args.prompt)

    # Summary
    print(f"\n=== SUMMARY ===")
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -145,20 +145,6 @@ int main(int argc, char ** argv) {

    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());

-    if (llama_model_has_encoder(model)) {
-        if (llama_encode(ctx, batch)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return 1;
-        }
-
-        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-        if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
-            decoder_start_token_id = llama_vocab_bos(vocab);
-        }
-
-        batch = llama_batch_get_one(&decoder_start_token_id, 1);
-    }
-
    // main loop

    const auto t_main_start = ggml_time_us();
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -1,40 +1,5 @@
 cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
 project("ggml" C CXX ASM)
-
-### GGML Version
-set(GGML_VERSION_MAJOR 0)
-set(GGML_VERSION_MINOR 9)
-set(GGML_VERSION_PATCH 4)
-set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
-
-find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
-if(GIT_EXE)
-    # Get current git commit hash
-    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        OUTPUT_VARIABLE GGML_BUILD_COMMIT
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-        ERROR_QUIET
-    )
-
-    # Check if the working directory is dirty (i.e., has uncommitted changes)
-    execute_process(COMMAND ${GIT_EXE} diff-index --quiet HEAD -- .
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        RESULT_VARIABLE GGML_GIT_DIRTY
-        ERROR_QUIET
-    )
-endif()
-
-# Build the version string with optional dirty flag
-set(GGML_VERSION "${GGML_VERSION_BASE}")
-if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
-    set(GGML_VERSION "${GGML_VERSION}-dirty")
-endif()
-
-if(NOT GGML_BUILD_COMMIT)
-    set(GGML_BUILD_COMMIT "unknown")
-endif()
-
 include(CheckIncludeFileCXX)

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -176,7 +141,7 @@ set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")


 if (MINGW)
-    set(GGML_WIN_VER "0xA00" CACHE STRING   "ggml: Windows version")
+    set(GGML_WIN_VER "0x602" CACHE STRING   "ggml: Windows version")
 endif()

 # ggml core
@@ -209,6 +174,7 @@ option(GGML_HIP                             "ggml: use HIP"
 option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
 option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
 option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention"         OFF)
+option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12   "ggml: enable rocWMMA FlashAttention on GFX12"    OFF)
 option(GGML_HIP_MMQ_MFMA                    "ggml: enable MFMA MMA for CDNA in MMQ"           ON)
 option(GGML_HIP_EXPORT_METRICS              "ggml: enable kernel perf metrics output"         OFF)
 option(GGML_MUSA_GRAPHS                     "ggml: use MUSA graph, experimental, unstable"    OFF)
@@ -222,11 +188,9 @@ option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"
 option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
 option(GGML_WEBGPU                          "ggml: use WebGPU"                                OFF)
 option(GGML_WEBGPU_DEBUG                    "ggml: enable WebGPU debug output"                OFF)
-option(GGML_WEBGPU_CPU_PROFILE              "ggml: enable WebGPU profiling (CPU)"             OFF)
-option(GGML_WEBGPU_GPU_PROFILE              "ggml: enable WebGPU profiling (GPU)"             OFF)
-
 option(GGML_ZDNN                            "ggml: use zDNN"                                  OFF)
 option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
+option(GGML_METAL_USE_BF16                  "ggml: use bfloat if available"                   OFF)
 option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
 option(GGML_METAL_SHADER_DEBUG              "ggml: compile Metal with -fno-fast-math"         OFF)
 option(GGML_METAL_EMBED_LIBRARY             "ggml: embed Metal library"                       ${GGML_METAL})
@@ -337,6 +301,26 @@ endif()
 # Create CMake package
 #

+# Generate version info based on git commit.
+
+if(NOT DEFINED GGML_BUILD_NUMBER)
+    find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
+    execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE GGML_BUILD_NUMBER
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+
+    if(GGML_BUILD_NUMBER EQUAL 1)
+        message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
+    endif()
+
+    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE GGML_BUILD_COMMIT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+endif()


 # Capture variables prefixed with GGML_.
@@ -365,7 +349,7 @@ set(GGML_VARIABLES_EXPANDED ${variable_set_statements})

 # Create the CMake package and set install location.

-set(GGML_INSTALL_VERSION ${GGML_VERSION})
+set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER})
 set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header  files")
 set(GGML_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
 set(GGML_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -132,8 +132,6 @@ extern "C" {
        GGML_BACKEND_DEVICE_TYPE_CPU,
        // GPU device using dedicated memory
        GGML_BACKEND_DEVICE_TYPE_GPU,
-        // integrated GPU device using host memory
-        GGML_BACKEND_DEVICE_TYPE_IGPU,
        // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
        GGML_BACKEND_DEVICE_TYPE_ACCEL
    };
@@ -152,21 +150,11 @@ extern "C" {

    // all the device properties
    struct ggml_backend_dev_props {
-        // device name
        const char * name;
-        // device description
        const char * description;
-        // device free memory in bytes
        size_t memory_free;
-        // device total memory in bytes
        size_t memory_total;
-        // device type
        enum ggml_backend_dev_type type;
-        // device id
-        //   for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
-        //   if the id is unknown, this should be NULL
-        const char * device_id;
-        // device capabilities
        struct ggml_backend_dev_caps caps;
    };

@@ -215,8 +203,6 @@ extern "C" {
    // Backend registry
    //

-    GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
-
    GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);

    // Backend (reg) enumeration
@@ -316,8 +302,7 @@ extern "C" {
    GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
    GGML_API int                  ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);

-    GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
-    GGML_API size_t                     ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
--- a/ggml/include/ggml-metal.h
+++ b/ggml/include/ggml-metal.h
@@ -39,13 +39,14 @@ extern "C" {
 // user-code should use only these functions
 //

-// TODO: remove in the future
 GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);

 GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);

 GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);

+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@@ -7,25 +7,26 @@
 extern "C" {
 #endif

-#define RPC_PROTO_MAJOR_VERSION    3
+#define RPC_PROTO_MAJOR_VERSION    2
 #define RPC_PROTO_MINOR_VERSION    0
 #define RPC_PROTO_PATCH_VERSION    0
 #define GGML_RPC_MAX_SERVERS       16

 // backend API
-GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
 GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);

-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint, uint32_t device);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);

-GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);
+GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);

-GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
-                                                    size_t n_threads, size_t n_devices,
-                                                    ggml_backend_dev_t * devices, size_t * free_mem, size_t * total_mem);
+GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
+                                                    const char * cache_dir,
+                                                    size_t free_mem, size_t total_mem);

 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint);
+
+GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);

 #ifdef  __cplusplus
 }
--- a/ggml/include/ggml-zdnn.h
+++ b/ggml/include/ggml-zdnn.h
@@ -7,8 +7,7 @@
 extern "C" {
 #endif

-// device buffer
-GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void);
+GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);

 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);

--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -237,8 +237,6 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1

-// TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726
-#define GGML_ROPE_TYPE_NORMAL 0
 #define GGML_ROPE_TYPE_NEOX   2
 #define GGML_ROPE_TYPE_MROPE  8
 #define GGML_ROPE_TYPE_VISION 24
@@ -286,19 +284,19 @@ __host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexc
 //    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
 //
 #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
-    const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
+    const type prefix##0 = (pointer)->array[0]; \
    GGML_UNUSED(prefix##0);
 #define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
    GGML_TENSOR_LOCALS_1    (type, prefix, pointer, array) \
-    const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
+    const type prefix##1 = (pointer)->array[1]; \
    GGML_UNUSED(prefix##1);
 #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
    GGML_TENSOR_LOCALS_2    (type, prefix, pointer, array) \
-    const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
+    const type prefix##2 = (pointer)->array[2]; \
    GGML_UNUSED(prefix##2);
 #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
    GGML_TENSOR_LOCALS_3  (type, prefix, pointer, array) \
-    const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
+    const type prefix##3 = (pointer)->array[3]; \
    GGML_UNUSED(prefix##3);

 #define GGML_TENSOR_UNARY_OP_LOCALS \
@@ -576,7 +574,6 @@ extern "C" {
        GGML_UNARY_OP_HARDSIGMOID,
        GGML_UNARY_OP_EXP,
        GGML_UNARY_OP_GELU_ERF,
-        GGML_UNARY_OP_XIELU,

        GGML_UNARY_OP_COUNT,
    };
@@ -1151,18 +1148,6 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-    // xIELU activation function
-    // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
-    // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
-    // that constrain the positive and negative source alpha values respectively
-    GGML_API struct ggml_tensor * ggml_xielu(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float alpha_n,
-            float alpha_p,
-            float beta,
-            float eps);
-
    // gated linear unit ops
    // A: n columns, r rows,
    // result is n / 2 columns, r rows,
@@ -1630,13 +1615,6 @@ extern "C" {
            float                 scale,
            float                 max_bias);

-    GGML_API struct ggml_tensor * ggml_soft_max_ext_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * mask,
-            float                 scale,
-            float                 max_bias);
-
    GGML_API void ggml_soft_max_add_sinks(
            struct ggml_tensor * a,
            struct ggml_tensor * sinks);
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -114,9 +114,6 @@ message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")

 if (NOT MSVC)
    if (GGML_STATIC)
-        if (UNIX AND NOT APPLE)
-            set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
-        endif()
        add_link_options(-static)
        if (MINGW)
            add_link_options(-static-libgcc -static-libstdc++)
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@@ -23,7 +23,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
 }

 // ops that return true for this function must not use restrict pointers for their backend implementations
-bool ggml_op_can_inplace(enum ggml_op op) {
+static bool ggml_op_can_inplace(enum ggml_op op) {
    switch (op) {
        case GGML_OP_SCALE:
        case GGML_OP_DIAG_MASK_ZERO:
@@ -95,104 +95,39 @@ enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_te

 // dynamic tensor allocator

-#define GGML_VBUFFER_MAX_CHUNKS 16
-
-// relative memory address within an allocation that can be split into multiple buffers (chunks)
-struct buffer_address {
-    int chunk;     // index of a backend buffer
-    size_t offset; // local memory offset within the buffer
-};
-
-static const struct buffer_address GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };
-
-static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
-    return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
-}
-
 struct free_block {
    size_t offset;
    size_t size;
 };

-struct tallocr_chunk {
-    struct free_block free_blocks[MAX_FREE_BLOCKS];
-    int n_free_blocks;
-    size_t max_size;
-};
-
 struct ggml_dyn_tallocr {
    size_t alignment;
-    size_t max_chunk_size;
-    struct tallocr_chunk * chunks[GGML_VBUFFER_MAX_CHUNKS];
-    int n_chunks;
+    int n_free_blocks;
+    struct free_block free_blocks[MAX_FREE_BLOCKS];
+    size_t max_size;

 #ifdef GGML_ALLOCATOR_DEBUG
    struct {
        const struct ggml_tensor * tensor;
-        struct buffer_address addr;
+        size_t offset;
    } allocated_tensors[1024];
 #endif
 };

-static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
-    GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
-    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
-    int insert_pos = 0;
-    while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
-        insert_pos++;
-    }
-    // shift all blocks from insert_pos onward to make room for the new block
-    for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
-        chunk->free_blocks[i] = chunk->free_blocks[i-1];
-    }
-    // insert the new block
-    chunk->free_blocks[insert_pos].offset = offset;
-    chunk->free_blocks[insert_pos].size = size;
-    chunk->n_free_blocks++;
-}
-
-static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
-    // shift all elements after idx by 1 to the left, overwriting the element at idx
-    for (int i = idx; i < chunk->n_free_blocks; i++) {
-        chunk->free_blocks[i] = chunk->free_blocks[i+1];
-    }
-    chunk->n_free_blocks--;
-}
-
-static int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
-    if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
-        return -1;
-    }
-    struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
-    chunk->n_free_blocks = 1;
-    chunk->free_blocks[0].offset = 0;
-    // available space in a chunk is limited to max_chunk_size, but can be higher if:
-    // 1. a single tensor exceeds the maximum, and cannot fit any other way
-    // 2. we are running out of chunks
-    // backends will either manage to allocate the larger size, or report an error.
-    chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
-    if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
-        chunk->free_blocks[0].size = SIZE_MAX/2;
-    }
-    alloc->chunks[alloc->n_chunks] = chunk;
-    alloc->n_chunks++;
-    return alloc->n_chunks - 1;
-}
-
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
+static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i].tensor == NULL) {
            alloc->allocated_tensors[i].tensor = tensor;
-            alloc->allocated_tensors[i].addr = addr;
+            alloc->allocated_tensors[i].offset = offset;
            return;
        }
    }
    GGML_ABORT("out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
+        if (alloc->allocated_tensors[i].offset == offset) {
            alloc->allocated_tensors[i].tensor = NULL;
            return;
        }
@@ -201,94 +136,76 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buff
 }
 #endif

-static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
+static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);

-    int best_fit_chunk = -1;
-    int best_fit_block = -1;
    size_t max_avail = 0;

-    // find the best fitting free block besides the last block, within any chunk
-    for (int c = 0; c < alloc->n_chunks; ++c) {
-        struct tallocr_chunk * chunk = alloc->chunks[c];
-        size_t best_fit_size = SIZE_MAX;
-        for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
-            struct free_block * block = &chunk->free_blocks[i];
-            max_avail = MAX(max_avail, block->size);
-            if (block->size >= size && block->size <= best_fit_size) {
-                best_fit_chunk = c;
-                best_fit_block = i;
-                best_fit_size = block->size;
-            }
+    // find the best fitting free block besides the last block
+    int best_fit_block = -1;
+    size_t best_fit_size = SIZE_MAX;
+    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
+        struct free_block * block = &alloc->free_blocks[i];
+        max_avail = MAX(max_avail, block->size);
+        if (block->size >= size && block->size <= best_fit_size) {
+            best_fit_block = i;
+            best_fit_size = block->size;
        }
    }

    if (best_fit_block == -1) {
-        // no suitable block found, try the last block (this will grow a chunks size)
-        for (int c = 0; c < alloc->n_chunks; ++c) {
-            struct tallocr_chunk * chunk = alloc->chunks[c];
-            if (chunk->n_free_blocks > 0) {
-                struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
-                max_avail = MAX(max_avail, block->size);
-                if (block->size >= size) {
-                    best_fit_chunk = c;
-                    best_fit_block = chunk->n_free_blocks - 1;
-                    break;
-                }
-            }
+        // the last block is our last resort
+        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        max_avail = MAX(max_avail, block->size);
+        if (block->size >= size) {
+            best_fit_block = alloc->n_free_blocks - 1;
+        } else {
+            // this should never happen
+            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+                    __func__, size, max_avail);
+            GGML_ABORT("not enough space in the buffer");
        }
    }

-    if (best_fit_block == -1) {
-        // none of the existing chunks have enough space left
-        best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size);
-        best_fit_block = 0;
-    }
-    if (best_fit_chunk == -1) {
-        // since the last chunk always has virtually endless memory, this should never happen
-        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
-            __func__, size, max_avail);
-        GGML_ABORT("graph allocation: failed to reserve memory");
-    }
-
-    struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
-    struct free_block    * block = &chunk->free_blocks[best_fit_block];
-    struct buffer_address  addr  = {.chunk = best_fit_chunk, .offset = block->offset };
-    block->offset += size;
+    struct free_block * block = &alloc->free_blocks[best_fit_block];
+    size_t offset = block->offset;
+    block->offset = offset + size;
    block->size -= size;
    if (block->size == 0) {
        // remove block if empty
-        ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
+        alloc->n_free_blocks--;
+        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
+            alloc->free_blocks[j] = alloc->free_blocks[j+1];
+        }
    }

-    AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
+    AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);

 #ifdef GGML_ALLOCATOR_DEBUG
-    add_allocated_tensor(alloc, addr, tensor);
-    size_t cur_max = addr.offset + size;
-    if (cur_max > alloc->max_size[addr.chunk]) {
-        // sort allocated_tensors by chunk/offset
+    add_allocated_tensor(alloc, offset, tensor);
+    size_t cur_max = offset + size;
+    if (cur_max > alloc->max_size) {
+        // sort allocated_tensors by offset
        for (int i = 0; i < 1024; i++) {
            for (int j = i + 1; j < 1024; j++) {
-                if (ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
+                if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
                    const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
-                    struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
+                    size_t tmp_offset = alloc->allocated_tensors[i].offset;
                    alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
-                    alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
+                    alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
                    alloc->allocated_tensors[j].tensor = tmp_tensor;
-                    alloc->allocated_tensors[j].addr = tmp_addr;
+                    alloc->allocated_tensors[j].offset = tmp_offset;
                }
            }
        }
-        GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
+        GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
        for (int i = 0; i < 1024; i++) {
            if (alloc->allocated_tensors[i].tensor) {
-                GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
-                    alloc->allocated_tensors[i].addr.chunk,
-                    alloc->allocated_tensors[i].addr.offset,
-                    alloc->allocated_tensors[i].addr.offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
+                GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                    alloc->allocated_tensors[i].offset,
+                    alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
                    ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
            }
        }
@@ -296,69 +213,78 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
    }
 #endif

-    chunk->max_size = MAX(chunk->max_size, addr.offset + size);
+    alloc->max_size = MAX(alloc->max_size, offset + size);

-    return addr;
+    return offset;

    GGML_UNUSED(tensor);
 }

 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
+static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
    size = aligned_offset(NULL, size, alloc->alignment);

-    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
-        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
+    AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);

 #ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, addr, tensor);
+    remove_allocated_tensor(alloc, offset, tensor);
 #endif

-    struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
-
    // see if we can merge with an existing block
-    for (int i = 0; i < chunk->n_free_blocks; i++) {
-        struct free_block * block = &chunk->free_blocks[i];
+    for (int i = 0; i < alloc->n_free_blocks; i++) {
+        struct free_block * block = &alloc->free_blocks[i];
        // check if ptr is at the end of the block
-        if (block->offset + block->size == addr.offset) {
+        if (block->offset + block->size == offset) {
            block->size += size;
            // check if we can merge with the next block
-            if (i < chunk->n_free_blocks - 1) {
-                struct free_block * next = &chunk->free_blocks[i+1];
-                if (block->offset + block->size == next->offset) {
-                    block->size += next->size;
-                    ggml_dyn_tallocr_remove_block(chunk, i+1);
+            if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
+                block->size += alloc->free_blocks[i+1].size;
+                alloc->n_free_blocks--;
+                for (int j = i+1; j < alloc->n_free_blocks; j++) {
+                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
                }
            }
            return;
        }
        // check if ptr is at the beginning of the block
-        if (addr.offset + size == block->offset) {
-            block->offset = addr.offset;
+        if (offset + size == block->offset) {
+            block->offset = offset;
            block->size += size;
            // check if we can merge with the previous block
-            if (i > 0) {
-                struct free_block * prev = &chunk->free_blocks[i-1];
-                if (prev->offset + prev->size == block->offset) {
-                    prev->size += block->size;
-                    ggml_dyn_tallocr_remove_block(chunk, i);
+            if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
+                alloc->free_blocks[i-1].size += block->size;
+                alloc->n_free_blocks--;
+                for (int j = i; j < alloc->n_free_blocks; j++) {
+                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
                }
            }
            return;
        }
    }
    // otherwise, add a new block
-    ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
+    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
+    int insert_pos = 0;
+    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
+        insert_pos++;
+    }
+    // shift all blocks from insert_pos onward to make room for the new block
+    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
+        alloc->free_blocks[i] = alloc->free_blocks[i-1];
+    }
+    // insert the new block
+    alloc->free_blocks[insert_pos].offset = offset;
+    alloc->free_blocks[insert_pos].size = size;
+    alloc->n_free_blocks++;

    GGML_UNUSED(tensor);
 }

 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
-    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
-        free(alloc->chunks[i]);
-        alloc->chunks[i] = NULL;
-    }
-    alloc->n_chunks = 0;
+    alloc->n_free_blocks = 1;
+    alloc->free_blocks[0].offset = 0;
+    alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
+    alloc->max_size = 0;

 #ifdef GGML_ALLOCATOR_DEBUG
    for (int i = 0; i < 1024; i++) {
@@ -367,14 +293,14 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
 #endif
 }

-static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
+static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
    struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));

    *alloc = (struct ggml_dyn_tallocr) {
-        /*.alignment      = */ alignment,
-        /*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
-        /*.chunks         = */ {NULL},
-        /*.n_chunks       = */ 0,
+        /*.alignment     = */ alignment,
+        /*.n_free_blocks = */ 0,
+        /*.free_blocks   = */ {{0}},
+        /*.max_size      = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ {{0}},
 #endif
@@ -386,73 +312,11 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t m
 }

 static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
-    for (int i = 0; i < alloc->n_chunks; ++i) {
-        free(alloc->chunks[i]);
-    }
    free(alloc);
 }

-static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc, int chunk) {
-    return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0;
-}
-
-
-// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
-
-struct vbuffer {
-    ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
-};
-
-static void ggml_vbuffer_free(struct vbuffer * buf) {
-    if (buf == NULL) {
-        return;
-    }
-    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) {
-        ggml_backend_buffer_free(buf->chunks[i]);
-    }
-    free(buf);
-}
-
-static size_t ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) {
-    return buf->chunks[chunk] ? ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0;
-}
-
-static size_t ggml_vbuffer_size(struct vbuffer * buf) {
-    size_t size = 0;
-    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
-        size += ggml_backend_buffer_get_size(buf->chunks[i]);
-    }
-    return size;
-}
-
-static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
-    struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
-    if (buf == NULL) {
-        return NULL;
-    }
-
-    for (int n = 0; n < talloc->n_chunks; n++) {
-        size_t chunk_size = talloc->chunks[n]->max_size;
-        buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size);
-        if (buf->chunks[n] == NULL) {
-            ggml_vbuffer_free(buf);
-            return NULL;
-        }
-        ggml_backend_buffer_set_usage(buf->chunks[n], usage);
-    }
-    return buf;
-}
-
-static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) {
-    void * base = ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
-    void * addr = (char *)base + buf_addr.offset;
-    ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
-}
-
-static void ggml_vbuffer_reset(struct vbuffer * buf) {
-    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
-        ggml_backend_buffer_reset(buf->chunks[i]);
-    }
+static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
+    return alloc->max_size;
 }


@@ -464,13 +328,13 @@ struct hash_node {
    int n_children;
    int n_views;
    int buffer_id;
-    struct buffer_address addr;
+    size_t offset; // offset within the buffer
    bool allocated;
 };

 struct tensor_alloc {
    int buffer_id;
-    struct buffer_address addr;
+    size_t offset;
    size_t size_max; // 0 = pre-allocated, unused, or view
 };

@@ -485,7 +349,7 @@ struct node_alloc {

 struct ggml_gallocr {
    ggml_backend_buffer_type_t * bufts; // [n_buffers]
-    struct vbuffer ** buffers; // [n_buffers]
+    ggml_backend_buffer_t * buffers; // [n_buffers]
    struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
    int n_buffers;

@@ -506,7 +370,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
    GGML_ASSERT(galloc->bufts != NULL);

-    galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
    GGML_ASSERT(galloc->buffers != NULL);

    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -526,8 +390,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs

        if (galloc->buf_tallocs[i] == NULL) {
            size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
-            size_t max_size = ggml_backend_buft_get_max_size(bufts[i]);
-            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment, max_size);
+            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
        }
    }
    galloc->n_buffers = n_bufs;
@@ -555,7 +418,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
                }
            }
            if (!freed) {
-                ggml_vbuffer_free(galloc->buffers[i]);
+                ggml_backend_buffer_free(galloc->buffers[i]);
            }
        }
        if (galloc->buf_tallocs != NULL) {
@@ -604,7 +467,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor

    if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
        hn->allocated = true;
-        assert(hn->addr.offset == 0);
+        assert(hn->offset == 0);

        // try to reuse a parent's buffer (inplace)
        if (ggml_op_can_inplace(node->op)) {
@@ -638,9 +501,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                        struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                        if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                            AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                            assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
+                            assert(view_src_hn->offset == p_hn->offset);
                            hn->buffer_id = p_hn->buffer_id;
-                            hn->addr = p_hn->addr;
+                            hn->offset = p_hn->offset;
                            p_hn->allocated = false; // avoid freeing the parent
                            view_src_hn->allocated = false;
                            return;
@@ -648,7 +511,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                    } else {
                        AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                        hn->buffer_id = p_hn->buffer_id;
-                        hn->addr = p_hn->addr;
+                        hn->offset = p_hn->offset;
                        p_hn->allocated = false; // avoid freeing the parent
                        return;
                    }
@@ -659,8 +522,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
        struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
        ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
        size_t size = ggml_backend_buft_get_alloc_size(buft, node);
+        size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
        hn->buffer_id = buffer_id;
-        hn->addr = ggml_dyn_tallocr_alloc(alloc, size, node);
+        hn->offset = offset;
    }
 }

@@ -672,11 +536,12 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
    }

    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+    size_t offset = hn->offset;
    int buffer_id = hn->buffer_id;
    struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
    ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
    size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-    ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
+    ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
    hn->allocated = false;
 }

@@ -827,24 +692,24 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
        struct node_alloc * node_alloc = &galloc->node_allocs[i];
        if (node->view_src || node->data) {
            node_alloc->dst.buffer_id = -1;
-            node_alloc->dst.addr = GGML_BUFFER_ADDRESS_INVALID;
+            node_alloc->dst.offset = SIZE_MAX;
            node_alloc->dst.size_max = 0;
        } else {
            struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
            node_alloc->dst.buffer_id = hn->buffer_id;
-            node_alloc->dst.addr = hn->addr;
+            node_alloc->dst.offset    = hn->offset;
            node_alloc->dst.size_max  = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
        }
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            struct ggml_tensor * src = node->src[j];
            if (!src || src->view_src || src->data) {
                node_alloc->src[j].buffer_id = -1;
-                node_alloc->src[j].addr = GGML_BUFFER_ADDRESS_INVALID;
+                node_alloc->src[j].offset = SIZE_MAX;
                node_alloc->src[j].size_max = 0;
            } else {
                struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
                node_alloc->src[j].buffer_id = hn->buffer_id;
-                node_alloc->src[j].addr = hn->addr;
+                node_alloc->src[j].offset   = hn->offset;
                node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
            }
        }
@@ -860,11 +725,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
        if (leaf->view_src || leaf->data) {
            galloc->leaf_allocs[i].leaf.buffer_id = -1;
-            galloc->leaf_allocs[i].leaf.addr = GGML_BUFFER_ADDRESS_INVALID;
+            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
            galloc->leaf_allocs[i].leaf.size_max = 0;
        } else {
            galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
-            galloc->leaf_allocs[i].leaf.addr = hn->addr;
+            galloc->leaf_allocs[i].leaf.offset = hn->offset;
            galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
        }
    }
@@ -879,29 +744,22 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
            }
        }

+        size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
+        size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
+
        // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
-        bool realloc = galloc->buffers[i] == NULL;
-        size_t new_size = 0;
-        for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) {
-            size_t cur_chunk_size = galloc->buffers[i] ? ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0;
-            size_t new_chunk_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c);
-            new_size += new_chunk_size;
-            if (new_chunk_size > cur_chunk_size) {
-                realloc = true;
-            }
-        }
-        if (realloc) {
+        if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
-            size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
            GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif

-            ggml_vbuffer_free(galloc->buffers[i]);
-            galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+            ggml_backend_buffer_free(galloc->buffers[i]);
+            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
            if (galloc->buffers[i] == NULL) {
                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                return false;
            }
+            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
        }
    }

@@ -914,11 +772,11 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {

 static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
    int buffer_id = tensor_alloc->buffer_id;
-    assert(tensor->data || tensor->view_src || ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
+    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);

    if (tensor->view_src != NULL) {
        if (tensor->buffer == NULL) {
-            assert(tensor_alloc->addr.offset == SIZE_MAX);
+            assert(tensor_alloc->offset == SIZE_MAX);
            if (tensor->view_src->buffer == NULL) {
                // this tensor was allocated without ggml-backend
                return;
@@ -927,9 +785,11 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
        }
    } else {
        if (tensor->data == NULL) {
-            assert(tensor_alloc->addr.offset != SIZE_MAX);
-            assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
-            ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
+            assert(tensor_alloc->offset != SIZE_MAX);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
+            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
+            void * addr = (char *)base + tensor_alloc->offset;
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
        } else {
            if (tensor->buffer == NULL) {
                // this tensor was allocated without ggml-backend
@@ -1014,7 +874,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
    // reset buffers
    for (int i = 0; i < galloc->n_buffers; i++) {
        if (galloc->buffers[i] != NULL) {
-            ggml_vbuffer_reset(galloc->buffers[i]);
+            ggml_backend_buffer_reset(galloc->buffers[i]);
        }
    }

@@ -1057,7 +917,7 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
        }
    }

-    return ggml_vbuffer_size(galloc->buffers[buffer_id]);
+    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }

 // utils
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@@ -8,7 +8,7 @@
 extern "C" {
 #endif

-    #define GGML_BACKEND_API_VERSION 2
+    #define GGML_BACKEND_API_VERSION 1

    //
    // Backend buffer type
@@ -116,7 +116,7 @@ extern "C" {
        void (*event_wait)  (ggml_backend_t backend, ggml_backend_event_t event);

        // (optional) sort/optimize the nodes in the graph
-        void                      (*graph_optimize)    (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        void                      (*optimize_graph)    (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    };

    struct ggml_backend {
@@ -209,6 +209,9 @@ extern "C" {
        void * context;
    };

+    // Internal backend registry API
+    GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
+
    // Add backend dynamic loading support to the backend

    // Initialize the backend
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -135,10 +135,6 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
    return p;
 }

-static const char * dl_error() {
-    return "";
-}
-
 #else

 using dl_handle = void;
@@ -159,11 +155,6 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
    return dlsym(handle, name);
 }

-static const char * dl_error() {
-    const char *rslt = dlerror();
-    return rslt != nullptr ? rslt : "";
-}
-
 #endif

 using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
@@ -249,7 +240,7 @@ struct ggml_backend_registry {
        dl_handle_ptr handle { dl_load_library(path) };
        if (!handle) {
            if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(path).c_str(), dl_error());
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(path).c_str());
            }
            return nullptr;
        }
@@ -409,8 +400,9 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const

 ggml_backend_t ggml_backend_init_best(void) {
    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
-    dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
-    dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (!dev) {
+        dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    }
    if (!dev) {
        return nullptr;
    }
@@ -539,7 +531,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
                if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
                    dl_handle_ptr handle { dl_load_library(entry) };
                    if (!handle && !silent) {
-                        GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(entry.path()).c_str(), dl_error());
+                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
                    }
                    if (handle) {
                        auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -463,10 +463,10 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
    backend->iface.event_wait(backend, event);
 }

-static void ggml_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    GGML_ASSERT(backend);
-    if (backend->iface.graph_optimize != NULL) {
-        backend->iface.graph_optimize(backend, cgraph);
+    if (backend->iface.optimize_graph != NULL) {
+        backend->iface.optimize_graph(backend, cgraph);
    }
 }

@@ -1307,7 +1307,7 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra

        // Optimize this split of the graph. This needs to happen before we make graph_copy,
        // so they are in sync.
-        ggml_backend_graph_optimize(sched->backends[split->backend_id], &split->graph);
+        ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph);

        // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
        for (int j = 0; j < split->n_inputs; j++) {
@@ -1793,14 +1793,6 @@ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i)
    return sched->backends[i];
 }

-ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
-    GGML_ASSERT(sched);
-    int backend_index = ggml_backend_sched_backend_id(sched, backend);
-    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-
-    return sched->bufts[backend_index];
-}
-
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
    GGML_ASSERT(sched);
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
--- a/ggml/src/ggml-blas/CMakeLists.txt
+++ b/ggml/src/ggml-blas/CMakeLists.txt
@@ -74,7 +74,7 @@ if (BLAS_FOUND)

    target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})

-    if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
+    if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
        add_compile_definitions(GGML_BLAS_USE_MKL)
    endif()

--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -270,7 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
    /* .graph_compute           = */ ggml_backend_blas_graph_compute,
    /* .event_record            = */ NULL,
    /* .event_wait              = */ NULL,
-    /* .graph_optimize          = */ NULL,
+    /* .optimize_graph          = */ NULL,
 };

 static ggml_guid_t ggml_backend_blas_guid(void) {
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Georgi Gerganov	3f62ee8bee	metal : back to a single queue per device ggml-ci	2025-09-09 17:06:46 +03:00
Georgi Gerganov	0926cb492d	metal : clean-up loose ends, ready for tests ggml-ci	2025-09-09 15:01:21 +03:00
Georgi Gerganov	f288225d42	metal : remove broken implementation of GGML_OP_SET ggml-ci	2025-09-09 14:29:54 +03:00
Georgi Gerganov	7fc2b3d503	metal : restore .alloc_buffer for buffer_from_ptr_type ggml-ci	2025-09-09 14:28:34 +03:00
Georgi Gerganov	85aaf52b7e	metal : create only metal buffers, no wrapping of host memory ggml-ci	2025-09-09 14:01:15 +03:00
Georgi Gerganov	d91ba85d04	metal : remove deprecated ggml_backend_metal_buffer_from_ptr	2025-09-09 09:29:11 +03:00
Georgi Gerganov	bdff7729b1	metal : fix batch size for MUL_MAT_ID	2025-09-09 09:29:11 +03:00
Georgi Gerganov	c5637cf39c	cont : add comments, extend op offload, clean up ggml-ci	2025-09-09 09:29:11 +03:00
Georgi Gerganov	97b96c1ad3	metal : make the backend async ggml-ci	2025-09-09 09:29:10 +03:00