Compare commits

244 Commits
b6324 ... b6568

Author SHA1 Message Date
Acly
f2a789e334 ggml : split graph allocations according to backend max buffer size (#15815)
* ggml : make gallocr respect the backend's max buffer size

* if the graph requires more memory than can fit into a single allocation, split it into multiple backend buffers
* vulkan: report the actual max allocation size in buffer type interface

* fix missing newline, apple-clang warning

* track size of individual chunks in ggml_dyn_tallocr and raise max chunks.
revert to use suballocation_block_size as max chunk size for vulkan.

* track (chunk, offset) pairs instead of "global" offsets through gallocr.

* simpler, don't need loops to map between local/global offsets
* touches more code

* fix dyn_tallocr_max_size and initialization

* fix memory leak when buffers are reused due to same buffer type appearing multiple times

* make vbuffer allocation follow the same logic as backend_buffer did before

* continue to use leftover unallocated space of previous chunks after a new one has been created

* treat free blocks of each chunk as separate list
* they're still allocated together, but start/end of each chunk is tracked, and allocate/free iterate over sub-ranges
* exhaust freed blocks of all chunks before considering their last blocks with unallocated space
* start with 0 chunks/blocks and create chunks as needed
* allow the last chunk to grow beyond max size

* refactor: move adding new free block and new chunk into separate functions

* allocate chunks individually with a separate free-blocks list for each one

* needs a bit more memory/allocations/indirections, but code is simpler

* fix warnings (missing static) & debug checks
2025-09-24 16:17:49 +02:00
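
To make the chunking scheme above concrete, here is a minimal sketch of a chunked allocator with a separate free-block list per chunk, in the spirit of the description; the names (`Chunk`, `FreeBlock`, `ChunkedAllocator`) are illustrative and do not match the actual ggml_dyn_tallocr internals:

```cpp
// Simplified sketch of a chunked dynamic allocator with one free-block list
// per chunk. Names are illustrative, not the real ggml_dyn_tallocr code.
#include <cstddef>
#include <vector>

struct FreeBlock { size_t offset, size; };

struct Chunk {
    size_t capacity = 0;                 // max bytes this chunk may hold
    size_t used     = 0;                 // high-water mark of allocations
    std::vector<FreeBlock> free_blocks;  // holes created by frees
};

struct Alloc { int chunk; size_t offset; };  // (chunk, offset) pair, not a global offset

struct ChunkedAllocator {
    size_t max_chunk_size;
    std::vector<Chunk> chunks;           // starts empty, chunks are created on demand

    Alloc alloc(size_t size) {
        // 1) exhaust freed blocks of all existing chunks first
        for (size_t c = 0; c < chunks.size(); ++c) {
            auto & fb = chunks[c].free_blocks;
            for (size_t i = 0; i < fb.size(); ++i) {
                if (fb[i].size >= size) {
                    Alloc a = { (int) c, fb[i].offset };
                    fb[i].offset += size;
                    fb[i].size   -= size;
                    if (fb[i].size == 0) fb.erase(fb.begin() + i);
                    return a;
                }
            }
        }
        // 2) use leftover unallocated space at the end of an existing chunk
        for (size_t c = 0; c < chunks.size(); ++c) {
            if (chunks[c].used + size <= chunks[c].capacity) {
                Alloc a = { (int) c, chunks[c].used };
                chunks[c].used += size;
                return a;
            }
        }
        // 3) otherwise open a new chunk; the last chunk may exceed max_chunk_size
        Chunk chunk;
        chunk.capacity = size > max_chunk_size ? size : max_chunk_size;
        chunk.used     = size;
        chunks.push_back(chunk);
        return { (int) chunks.size() - 1, 0 };
    }

    void free(Alloc a, size_t size) {
        chunks[a.chunk].free_blocks.push_back({ a.offset, size });
    }
};
```
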
Tarek Dakhran
3a59971967 model : add label for LiquidAI LFM2-2.6B model (#16204)
* model : add label for LiquidAI LFM2-2.6B model

HF link: [LiquidAI/LFM2-2.6B](https://huggingface.co/LiquidAI/LFM2-2.6B).

Support for GGUF conversion and inference is added in #14620.

However, due to similar `n_embd`, it identifies as a 1.2B model.
Fix the label by using `n_ff` to identify the model instead.

Output of `llama-bench`:
```
| model                          |       size |     params | backend    | threads |            test |                  t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | --------------: | -------------------: |
| lfm2 1.2B F16                  |   2.18 GiB |     1.17 B | CPU        |      10 |           pp512 |        223.97 ± 5.32 |
| lfm2 2.6B F16                  |   4.79 GiB |     2.57 B | CPU        |      10 |           pp512 |         92.53 ± 4.14 |
| lfm2 350M F16                  | 676.25 MiB |   354.48 M | CPU        |      10 |           pp512 |       725.52 ± 11.70 |
| lfm2 700M F16                  |   1.38 GiB |   742.49 M | CPU        |      10 |           pp512 |       336.22 ± 12.93 |
```

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-09-24 13:42:26 +02:00
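
As a rough illustration of the fix, disambiguating the size label by `n_ff` rather than `n_embd` could look like this; the thresholds are placeholders, not the values used in `src/llama-model.cpp`:

```cpp
// Hypothetical sketch: when two model sizes share the same n_embd, a secondary
// hyperparameter such as n_ff can disambiguate the size label.
#include <cstdint>
#include <string>

std::string lfm2_size_label(uint32_t n_embd, uint32_t n_ff) {
    // n_embd alone is ambiguous between the 1.2B and 2.6B variants,
    // so the feed-forward width is used instead. Thresholds are placeholders.
    if (n_embd < 1024) {
        return n_ff < 4096 ? "350M" : "700M";
    }
    return n_ff < 10000 ? "1.2B" : "2.6B";
}
```
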
Jie Fu (傅杰)
63b54c81a6 model-conversion : make causal-verify-logits work with model names containing "." (#16215)
Signed-off-by: Jie Fu <jiefu@tencent.com>
2025-09-24 10:25:26 +02:00
Uilian Ries
152729f884 common : add missing chrono header for common.cpp (#16211)
Signed-off-by: Uilian Ries <uilianries@gmail.com>
2025-09-24 09:53:47 +03:00
Sigbjørn Skjæret
c0c59c1157 codeowners : match all requirements files (#16214) 2025-09-24 08:53:20 +02:00
Jie Fu (傅杰)
7735706b93 model-conversion : run-org-model.py fails to run on mac m1 (#16213)
Signed-off-by: Jie Fu <jiefu@tencent.com>
2025-09-24 08:46:52 +02:00
Daniel Bevenius
4d9ea03d17 codeowners : use slash prefix for root files [no ci] (#16210)
This commit adds a leading slash to the paths of root-level files
in the CODEOWNERS file.

The motivation for this is that these patterns might otherwise match files
in subdirectories that have other/additional owners, overriding them.

Refs: https://github.com/ggml-org/llama.cpp/pull/16209#issuecomment-3326434274
2025-09-24 08:10:09 +02:00
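
For illustration, the matching difference in CODEOWNERS syntax (file names and owners below are made up):

```
# Without a leading slash, the pattern matches any file with that name,
# including ones in subdirectories that have their own owners:
CMakeLists.txt     @root-owner     # also matches foo/CMakeLists.txt

# With a leading slash, only the root-level file is matched:
/CMakeLists.txt    @root-owner
foo/               @foo-owner      # foo/CMakeLists.txt stays with @foo-owner
```
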
Jie Fu (傅杰)
8ba548dae2 model-conversion : fix the make targets in the README.md (#16209)
Fix two incorrect make targets in the readme.

Signed-off-by: Jie Fu <jiefu@tencent.com>
2025-09-24 06:19:23 +02:00
Georgi Gerganov
f505bd83ca ci : disable AMD workflows + update NVIDIA workflows (#16200)
* ci : disable AMD workflows + update NVIDIA workflows

* cont : fixes

* cont : update nvidia vulkan workflows
2025-09-23 20:41:40 +03:00
Georgi Gerganov
0889589dbe ci : enable Vulkan workflow on Mac (#16194) 2025-09-23 13:44:25 +03:00
Xiangyan Sun
4e29084ba4 ggml-cpu: Respect cpumask settings (#16164) 2025-09-23 11:58:12 +03:00
Sigbjørn Skjæret
f6b4af3d04 ggml : fix uninitialized is_on_grid in quantize_row_iq3_xxs_impl (#15928)
* fix uninitialized is_on_grid in quantize_row_iq3_xxs_impl

* change initialization to true
2025-09-23 10:25:20 +02:00
Aaron Teo
264f1b5187 zdnn: refactor codebase + add docs (#16178)
* zdnn: initial matmul refactor

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: rm static from funcs

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: update ggml-zdnn.h

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: change header files to hpp

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: switch to common.hpp

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: move mulmat forward around

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: rm inline from utils

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: code cleanup

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* docs: add zDNN docs

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

---------

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2025-09-23 14:53:05 +08:00
Daniel Bevenius
0bc7cc7154 codeowners : add @danbev to model-conversion example [no ci] (#16190)
This commit adds examples/model-conversion/ to the CODEOWNERS file and
assigns myself (@danbev) as the code owner for this directory.
2025-09-23 09:13:22 +03:00
Aaron Teo
4b9f4cb0f8 devops: add s390x containers (#15915)
* devops: add s390x dockerfile

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: add missing ninja

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: move s390x docker into cpu docker

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: rework s390x docker

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: copy more tools

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: add server build step

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: remove apt clean steps as distroless misses it

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: remove apt commands from distroless

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: fix shared libs in distroless

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: use correct libs path

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: fix shared libs

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: add collector stage

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: fix missing stage ref

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: fix permission issue

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: fix unknown model loading failures

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: attempt at fixing model loading failure

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: fix missing ggml shared object

failure to load model

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: remove move shared objects

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: move libggml-cpu and blas into bin

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: finalise hardened server stage

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: add cli target

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: fix typos

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: fix missing shared libraries in base

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: update debian target

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: formalise llama.cpp loc

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* Revert "devops: formalise llama.cpp loc"

This reverts commit 0a7664af84.

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: formalise llama.cpp loc

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
(cherry picked from commit 0a7664af84)
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: attempt at fixing missing dir

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: attempt at making it cache the build

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: fix copying process

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: make build dir an argument

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* Revert "devops: make build dir an argument"

This reverts commit 438698976b.

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: add build stage for gguf-py

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: move gguf-py installation into build stage

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: break system packages?

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: add rust compiler installer

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: fix rustc not found

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: remove cache mount to allow rustc to persist

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: move rustc installation to another layer

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: move gguf-py installation to full stage, fix copying

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: remove rustc installation in build

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: disable full target for now

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: attempting static build

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: merge s390x dockerfile into cpu for now

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: switch to gcc image for build step

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: remove build essentials

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: install openblas into base target

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: go back to s390x dockerfile

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: remove libggml and libblas

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: add full target

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: add break system packages

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: add libjpeg

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: add missing cmake dep

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: finalise docker images for s390x

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: add custom openblas patch

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: use libopenblas-dev instead of libopenblas-openmp-dev

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* devops: add s390x docker build

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

---------

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2025-09-23 13:59:34 +08:00
Daniel Bevenius
85e72271ba ggml-cpu : fix typo in gemm comments [no ci] (#16189) 2025-09-23 05:59:03 +02:00
Gabe Goodhart
1d0125bcf1 feat: Add conversion support in GraniteHybrid for non-hybrid (all attn) (#16177)
This is a configuration of the hparams in the GraniteHybrid architecture
that devolves to the Granite (or GraniteMoe) architecture (ie Granite 3.x).
It may be used for some models in the Granite 4 family with the
GraniteHybrid architecture acting as a superset arch. Rather than support
it directly in the c++ graph, we simply coerce the architecture flag back
to the correct "granite" or "granitemoe" architecture.

Branch: gabe-l-hart/GraniteNonHybridConversion

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-09-22 20:40:10 +02:00
Haiyue Wang
351f3da39c clang-tidy : disable warning about performance enum size (#16127)
Disable 'performance-enum-size' checking:

Enum 'llama_token_type' uses a larger base type ('unsigned int', size: 4 bytes)
than necessary for its value set, consider using 'std::uint8_t' (1 byte) as the
base type to reduce its size.
2025-09-22 19:57:46 +02:00
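
A `.clang-tidy` fragment showing how a single check is typically disabled while keeping the rest of its group enabled; the `performance-*` entry is illustrative, not the project's full configuration:

```yaml
# .clang-tidy (fragment) -- later globs override earlier ones,
# so performance-enum-size is switched off while the rest stay on.
Checks: >
  performance-*,
  -performance-enum-size
```
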
Sigbjørn Skjæret
3ecb2f671a ggml : implement set_rows with i32 index (#16159)
* implement set_rows with i32 index

* template fix

* test quantized path

warnings--

* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* forgotten name change

* deduplicate cuda/sycl and test-fix

* indent++

* vulkan: support set_rows with i32 index type (#16162)

* disable i32 index for webgpu for now

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Jeff Bolz <jbolz@nvidia.com>
2025-09-22 19:13:00 +02:00
Georgi Gerganov
432cf4304c codeowners : update + cleanup (#16174)
---------

Co-authored-by: slaren <slarengh@gmail.com>
2025-09-22 18:20:21 +03:00
Adrien Gallouët
37a23c17bd common : enable --offline mode without curl support (#16137)
* common : use the json parser

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* common : enable --offline mode without CURL support

This change refactors the download logic to properly support offline mode
even when the project is built without CURL.

Without this commit, using `--offline` would give the following error:

    error: built without CURL, cannot download model from the internet

even if all the files are already cached.

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

---------

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2025-09-22 15:13:51 +03:00
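
A rough sketch of the intended control flow, with invented helper names rather than the actual common/ download code: the "built without CURL" error should only be raised when a network download is actually required.

```cpp
#include <filesystem>
#include <stdexcept>
#include <string>

// Hypothetical helpers standing in for the real cache/download code.
static std::string cache_path_for(const std::string & url) {
    return "/tmp/llama-cache/" + url.substr(url.rfind('/') + 1);
}
static std::string download_to_cache(const std::string & url) {
    /* libcurl download elided in this sketch */
    return cache_path_for(url);
}

std::string resolve_model(const std::string & url, bool offline, bool built_with_curl) {
    const std::string cached = cache_path_for(url);
    if (std::filesystem::exists(cached)) {
        return cached;                  // a cached file works with or without CURL
    }
    if (offline) {
        throw std::runtime_error("--offline: model not found in cache");
    }
    if (!built_with_curl) {
        // only raised when a download is actually needed
        throw std::runtime_error("built without CURL, cannot download model");
    }
    return download_to_cache(url);
}
```
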
Quentin Bramas
138c87ce8b webui : fix handling incomplete chunks (#16107) 2025-09-22 11:53:13 +03:00
GideonSerf
c6db9a1027 embedding : fix typos in README (#16171) 2025-09-22 11:49:58 +03:00
Haiyue Wang
d05affbab7 common : remove unused local variables (#16140)
These two local variables 'arg' and 'arg_prefix' have been overridden by:

  1. for (const auto & arg : opt.args)

  2. for (int i = 1; i < argc; i++) {
        const std::string arg_prefix = "--";

        std::string arg = argv[i];
2025-09-22 11:48:42 +03:00
Georgi Gerganov
4f324a556c ggml : extend ggml_can_fuse to work with non-sequential nodes (#16123)
* ggml : extend ggml_can_fuse to work with non-sequential nodes in the graph

* cont : fix wrong bounds check condition

* cont : remove unnecessary overload
2025-09-22 11:12:37 +03:00
Georgi Gerganov
a71ae3ba7a ggml : add ggml_op_is_empty (#16122)
* ggml : add ggml_op_is_empty

* ggml : move to ggml-impl.h
2025-09-22 11:12:09 +03:00
Xuan-Son Nguyen
05a2458121 codeowners : update ownership for @ngxson and @allozuar (#16128) 2025-09-22 11:10:58 +03:00
Shin-myoung-serp
96fdca043b Vulkan: add conv_transpose_2d operation (#16022)
* Vulkan: add conv_transpose_2d operation

* Vulkan: fix typo in conv_transpose_2d shader(s0mp, s0L, s1mp, s1L)

* Vulkan: fix incorrect indentation in conv_transpose_2d shader

* Vulkan: add checking the push constants size limit and reuse conv2d_mm.comp for conv_transpose_2d operation

* Vulkan: revert the order of the index calculation and bound check in conv_2d shader

* Vulkan: explicitly check push constants limit in supports_op() for conv_transpose_2d operation.

* Vulkan: remove unnecessary lower bound checks for H/W_idx in the conv_2d shader.
2025-09-22 10:04:01 +02:00
Sigbjørn Skjæret
b2d980fce0 codeowners : claim responsibility for ci, models, gguf-py and convert (#16124)
* claim responsibility for ci, gguf-py and convert

* add myself to various src/llama- files
2025-09-22 10:59:05 +03:00
Georgi Gerganov
5c6106a696 contrib : update roles (#16113)
* contrib : update roles

* contrib : merge PR sections + add link to CI instructions

Updated pull request guidelines for contributors and collaborators, and clarified merging practices for maintainers.
2025-09-22 10:58:02 +03:00
Georgi Gerganov
ec65fb52f0 ci : remove vulkaninfo calls (#16169) 2025-09-22 10:16:05 +03:00
Georgi Gerganov
1d660d2fae ci : use smaller model (#16168)
* ci : switch from gemma to qwen3 0.6b

* ci : use smaller model for some tests
2025-09-22 09:11:39 +03:00
Jeff Bolz
a20d810d79 vulkan: add RTE variants of exp shader (#16165)
This fixes some failures on Turing where "round to zero" rounds to the max f16
value but the CPU reference value is infinite.
2025-09-22 07:37:17 +02:00
Georgi Gerganov
4d0a7cbc61 ci : adjust params for less runtime (#16167)
* ci : adjust params for less runtime

* ci : gate BF16 on some hardware

* ci : move extra tests to Arm runner
2025-09-22 08:31:40 +03:00
Ruben Ortlam
9073a73d82 vulkan: vec dot matrix multiplication fix (#16151)
* vulkan: fix matrix multiplication index calculation for odd m/n and odd k in combination with batching

* add odd m/n + odd k test with batching
2025-09-22 07:22:43 +02:00
lhez
51f5a45fbe opencl: fix concat crash on win arm64 with Adreno (#15944) 2025-09-21 16:42:10 -07:00
lhez
c4510dc937 opencl: initial q8_0 mv support (#15732) 2025-09-21 14:48:44 -07:00
Georgi Gerganov
da30ab5f86 ci : add label for the RISC-V runner (#16150) 2025-09-21 19:00:27 +03:00
Georgi Gerganov
28baac9c9f ci : migrate ggml ci to self-hosted runners (#16116)
* ci : migrate ggml ci to a self-hosted runners

* ci : add T4 runner

* ci : add instructions for adding self-hosted runners

* ci : disable test-backend-ops from debug builds due to slowness

* ci : add AMD V710 runner (vulkan)

* cont : add ROCM workflow

* ci : switch to qwen3 0.6b model

* cont : fix the context size
2025-09-21 16:50:45 +03:00
Giuseppe Scrivano
1eeb523c3e vulkan: optimize UMA buffer operations and fix driver hangs (#16059)
* vulkan: optimize UMA buffer operations and fix driver hangs

The previous implementation was blocking the GPU for extended periods,
causing the i915 driver to reset the context due to the hangcheck
protection.

[32628.443070] i915 0000:00:02.0: [drm] GPU HANG: ecode 12:1:85dffffb, in llama-server [194114]
[32628.443091] i915 0000:00:02.0: [drm] llama-server[194114] context reset due to GPU hang

* vulkan: implement deferred_memset on UMA

---------

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
2025-09-21 08:31:55 +02:00
Jeff Bolz
5bb4a3edec vulkan: fix validation error about VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR (#16086) 2025-09-21 08:23:37 +02:00
Georgi Gerganov
7f766929ca sync : ggml 2025-09-20 13:02:14 +03:00
Daniel Bevenius
405921dcef ggml : introduce semantic versioning (ggml/1336)
* ggml : introduce semantic versioning

This commit introduces semantic versioning for the GGML library.

The motivation for this is that the current versioning, using build
numbers, makes it difficult to track changes and releases for projects
that use ggml.

The release steps are the following:
1. Sync the changes from llama.cpp using sync-llama-am.sh and after the
   PR has been approved and merged move to step 2.
2. Run scripts/release.sh and specify the type of release, major, minor,
   or patch. This script will handle incrementing the version
   (major|minor|patch), create a new commit with the version change,
   create a tag for the version, and prepare for the next development
   iteration.
3. Inspect the commits/tag and push to master. This will trigger the
   github release workflow which is triggered for new tags which will
   then publish a new release on github.

Example usage:
```console
$ ./scripts/release.sh major --dry-run
[dry-run] - No changes will be made

Step 1: Reading current version...
Current version: 0.9.0-dev
New release version: 1.0.0

Step 2: Updating version in ggml/CMakeLists.txt...
  [dry-run] Would update GGML_VERSION_MAJOR to 1
  [dry-run] Would update GGML_VERSION_MINOR to 0
  [dry-run] Would update GGML_VERSION_PATCH to 0
  [dry-run] Would remove -dev suffix

Step 3: Committing version bump...
  [dry-run] Would commit: 'ggml : bump version to 1.0.0'

Step 4: Creating git tag...
  [dry-run] Would create tag: v1.0.0 with message 'Release version 1.0.0'

Step 5: Preparing for next development cycle...
  [dry-run] Would update GGML_VERSION_MINOR to 1
  [dry-run] Would add -dev suffix back

Step 6: Committing development version...
  [dry-run] Would commit: 'ggml : prepare for development of 1.1.0-dev'

[dry-run] Summary (no changes were made):
  • Would have released version: 1.0.0
  • Would have created tag: v1.0.0
  • Would have set next development version: 1.1.0-dev
```

Refs: https://github.com/ggml-org/ggml/issues/1333

* ggml: create branch for release candidate and check master

* ggml : sign the git tag
2025-09-20 13:02:14 +03:00
Gregor Jasny
fa6383ca7e CUDA : conditionally add cuda architectures (ggml/1341) 2025-09-20 13:02:14 +03:00
Ruben Ortlam
803dac2e48 vulkan: use vec dot for matrix matrix multiplications (#16056)
* vulkan: Change the mul_mm shared memory and register caching system to use vec2 instead of scalars, to enable using dot2 instructions

* use fma instead of dot to fix Nvidia and Apple performance issues
2025-09-20 10:42:56 +02:00
Benni
459c0c2c1a server: fix SSE and OpenAI compatibility for error messages when streaming (#16109)
* server: fix SSE and OpenAI compatibility for error messages when streaming

* server: remove obsolete event parameter and use required data fieldname instead
2025-09-20 07:56:30 +02:00
ssweens
be79d9fdd9 llama-bench: add --devices and --list-devices support (#16039)
* * llama-bench: add --devices support
- Support --devices same as llama-server
- Provide for benchmarking different device combinations
- Include --list-devices like llama-server for convenience

* fix: field display ordering restored

* fix: integrated the rpc devices
- aimed to mimic the server as much as possible

* cleanup: defaults for list-devices
- handle dup device listing with RPC

* cleanup: remove dup device load calls

* docs: update llama-bench
- added the recently added n-cpu-moe option to the docs while in there

* llama-bench: rpc device simplification
* rpc servers unify with other devices earlier, simplifying code
* --list-devices made stateless and simpler
* various cleanup
2025-09-20 00:15:21 +02:00
shun095
f432d8d83e chat: Fix streaming parser for granite models (#15682)
* fix(chat): fix streaming parser for granite models

* tests: add test cases for Granite models chat parser
2025-09-19 09:57:30 -06:00
Aleksander Grygier
4067f07fc5 feat: Improve mobile UI for Settings Dialog (#16084)
* feat: Improve mobile UI for Settings Dialog

* chore: update webui build output

* fix: Linting errors

* chore: update webui build output
2025-09-19 09:52:27 +02:00
Xuan-Son Nguyen
4b8560ab56 chat : fix build on arm64 (#16101) 2025-09-19 13:02:51 +07:00
Xuan-Son Nguyen
0dd58b6877 ggml : refactor forward_dup for cpu backend (#16062)
* ggml : refactor forward_dup for cpu backend

* clean up a bit

* add quant/dequant perf test
2025-09-19 06:31:56 +02:00
Adrien Gallouët
69ffd89163 ggml-amx : fix ggml_amx_init() on generic Linux (#16049)
Generalize Linux check to `__linux__` to support non-glibc systems (like musl).
Also, return `false` on unknown/untested OS.

Without this commit, the code compiles (with warnings) but fails:

    register_backend: registered backend CPU (1 devices)
    register_device: registered device CPU (Intel(R) Xeon(R) Platinum 8488C)
    build: 6487 (51c4cac6) with x86_64-linux-musl-gcc (GCC) 15.1.0 for x86_64-linux-musl (debug)
    system info: n_threads = 8, n_threads_batch = 8, total_threads = 16
    ....
    print_info: n_ctx_orig_yarn  = 262144
    print_info: rope_finetuned   = unknown
    print_info: model type       = 4B
    Illegal instruction (core dumped)

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2025-09-18 23:07:26 +02:00
Adrien Gallouët
246c0d9c79 cmake : fix static linking for OpenMP on Unix-like systems (#16031)
When compiling with GGML_STATIC=ON, the build process would produce a
binary that was still dynamically linked to OpenMP. This defeats the
purpose of a static build:

    $ cmake -B build \
            -DBUILD_SHARED_LIBS=OFF \
            -DLLAMA_CURL=OFF \
            -DGGML_CCACHE=OFF \
            -DGGML_NATIVE=OFF \
            -DGGML_STATIC=ON

    $ ldd llama-server
            linux-vdso.so.1 (0x0000e1a434e3b000)
            libgomp.so.1 => /lib/aarch64-linux-gnu/libgomp.so.1 (0x0000e1a4345a0000)
            libstdc++.so.6 => /lib/aarch64-linux-gnu/libstdc++.so.6 (0x0000e1a434300000)
            libm.so.6 => /lib/aarch64-linux-gnu/libm.so.6 (0x0000e1a434240000)
            libgcc_s.so.1 => /lib/aarch64-linux-gnu/libgcc_s.so.1 (0x0000e1a434200000)
            libc.so.6 => /lib/aarch64-linux-gnu/libc.so.6 (0x0000e1a434030000)
            /lib/ld-linux-aarch64.so.1 (0x0000e1a434df0000)

This commit resolves the issue by modifying `CMAKE_FIND_LIBRARY_SUFFIXES`
to prioritize `.a` files, forcing CMake to link the static version of
the library.

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2025-09-18 23:07:18 +02:00
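
A common CMake idiom for this kind of fix, sketched under the assumption that `GGML_STATIC` gates it: prefer `.a` archives when searching for libraries so `find_package(OpenMP)` links the static runtime instead of libgomp.so.

```cmake
# Sketch: when building fully static, search for .a archives first so the
# OpenMP runtime resolves to its static library.
if (GGML_STATIC)
    set(CMAKE_FIND_LIBRARY_SUFFIXES .a ${CMAKE_FIND_LIBRARY_SUFFIXES})
endif()
find_package(OpenMP)
```
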
Shawn Gu
3edd87cd05 opencl: optimize mxfp4 kernels (#16037)
- flatten mxfp4 and packed fp4->fp16 bit-wise convert function (replace lut)
- MoE kernel optimizations

---------

Co-authored-by: Li He <lih@qti.qualcomm.com>
2025-09-18 12:03:34 -07:00
Jeff Bolz
c0b45097c3 rename optimize_graph to graph_optimize (#16082) 2025-09-18 13:46:17 -05:00
Bowen Han
38dbdf4c05 CUDA: Optimize PAD_REFLECT_1D (#15957)
* CUDA: Optimize PAD_REFLECT_1D
feat: add more test cases for PAD_REFLECT_1D

* use fast_div to improve performance

* Apply suggestion from JohannesGaessler

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Apply suggestion from JohannesGaessler

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* optimize

* use a concise expression to further speedup the cuda kernel

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2025-09-18 20:26:03 +02:00
Johannes Gäßler
368560a1e3 CUDA: fix compilation on CC 6.0 (#16091) 2025-09-18 19:28:32 +02:00
Eric Curtin
4ca088b036 Add resumable downloads for llama-server model loading (#15963)
- Implement resumable downloads in common_download_file_single function
- Add detection of partial download files (.downloadInProgress)
- Check server support for HTTP Range requests via Accept-Ranges header
- Implement HTTP Range request with "bytes=<start>-" header
- Open files in append mode when resuming vs create mode for new downloads

Signed-off-by: Eric Curtin <eric.curtin@docker.com>
2025-09-18 16:22:50 +01:00
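
A hedged sketch of the resume logic using libcurl; the helper structure and the `.downloadInProgress` suffix follow the description above, the rest is illustrative rather than the actual implementation:

```cpp
#include <cstdio>
#include <string>
#include <sys/stat.h>
#include <curl/curl.h>

// Illustrative only: resume a partial download if a .downloadInProgress file
// exists and the server supports HTTP Range requests (checked elsewhere).
bool download_resumable(const std::string & url, const std::string & path) {
    const std::string partial = path + ".downloadInProgress";

    struct stat st{};
    const bool resuming = stat(partial.c_str(), &st) == 0 && st.st_size > 0;

    FILE * f = fopen(partial.c_str(), resuming ? "ab" : "wb");  // append when resuming
    if (!f) return false;

    CURL * curl = curl_easy_init();
    if (!curl) { fclose(f); return false; }

    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, f);   // default callback fwrites to f
    if (resuming) {
        // equivalent to sending a "Range: bytes=<start>-" header
        curl_easy_setopt(curl, CURLOPT_RESUME_FROM_LARGE, (curl_off_t) st.st_size);
    }
    const CURLcode res = curl_easy_perform(curl);
    curl_easy_cleanup(curl);
    fclose(f);

    return res == CURLE_OK && rename(partial.c_str(), path.c_str()) == 0;
}
```
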
Georgi Gerganov
703f9e32c4 metal : use function constants for mul_mv_ext kernels (#16074)
* metal : use function constants for mul_mv_ext kernels

ggml-ci

* metal : remove NW template argument

ggml-ci

* metal : adjust constants

ggml-ci
2025-09-18 16:28:41 +03:00
Sigbjørn Skjæret
ad6bd9083b cuda : add missing F32<->I32 entries in ggml_cuda_cpy_fn (#16060) 2025-09-18 13:28:22 +02:00
Radoslav Gerganov
2b6b55a59f server : include usage statistics only when user request them (#16052)
* server : include usage statistics only when user request them

When serving the OpenAI compatible API, we should check if
{"stream_options": {"include_usage": true} is set in the request when
deciding whether we should send usage statistics

closes: #16048

* add unit test
2025-09-18 10:36:57 +00:00
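
For reference, the OpenAI-style request field being checked looks like this (model and prompt are placeholders):

```json
{
  "model": "placeholder-model",
  "messages": [{ "role": "user", "content": "Hello" }],
  "stream": true,
  "stream_options": { "include_usage": true }
}
```
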
Georgi Gerganov
e58174cecb llama : bump max seq limit from 64 to 256 (#15916)
ggml-ci
2025-09-18 12:47:56 +03:00
Georgi Gerganov
b213fce89b metal : improve F32, F16 and BF16 mat-vec multiplication (#16057)
* metal : improve F32, F16 and BF16 mat-vec multiplication

ggml-ci

* metal : make the NSG a function constant in mul_mv kernels

ggml-ci
2025-09-18 12:33:45 +03:00
Jhen-Jie Hong
e00f3fd8ff metal : avoid call free for non-owned buffer (#16067) 2025-09-18 10:06:48 +03:00
Georgi Gerganov
f2f28380ea metal : handle nil cv during pipeline creation (#16065)
ggml-ci
2025-09-18 10:03:24 +03:00
Chenguang Li
62c3b645c5 CANN: Remove print (#16044)
Signed-off-by: noemotiovon <757486878@qq.com>
2025-09-18 09:26:33 +08:00
Reese Levine
d304f459d8 GGML WebGPU: Support for ADD, MUL, RMS_NORM, GET_ROWS operators (#16018)
* Add parameter buffer pool, batching of submissions, refactor command building/submission

* Add header for linux builds

* Free staged parameter buffers at once

* Format with clang-format

* Fix thread-safe implementation

* Use device implicit synchronization

* Update workflow to use custom release

* Remove testing branch workflow

* some f32 tests passing

* Disable set_rows until it's implemented

* f32 add all tests passing

* Begin work on set_rows

* Work on set rows

* Add error buffers for reporting unsupported SET_ROWS indices

* Remove extra comments

* Add templated addition, clean up code

* Get addition and multiplication working

* Implement rms_norm

* Add get_rows implementation

* Add new get_rows files

* Refactor use of wg size entry

* Fix compilation

* Try manually unrolled q4_0 quant

* Revert "Try manually unrolled q4_0 quant"

This reverts commit 77f8b96515.

* Move to constant max wg size

* Check for tensor size in supports_op

* Vectorize f32 and change default workgroup size

* Move f32 get_rows from < 4 to % 4 != 0

* fix linter errors

* Add in-place tests

---------

Co-authored-by: Neha Abbas <nehaabbas@ReeseLevines-MacBook-Pro.local>
2025-09-17 13:09:40 -07:00
Georgi Gerganov
0320ac5264 metal : refactor + optimize v2 (#15995)
* metal : improve naming

* metal : refactor device

ggml-ci

* cont : props

ggml-ci

* metal : apply ggml_mem_ranges_t

ggml-ci

* metal : remove GGML_METAL_USE_BF16

ggml-ci

* metal : refactor device buffer

ggml-ci

* cont : fix naming

* metal : sync before destroying the backend

ggml-ci

* metal : refactor context

ggml-ci

* metal : migrate ggml-metal.m to ggml-metal.cpp

ggml-ci

* metal : adjust ops API

ggml-ci

* metal : use C++ to store pipelines

ggml-ci

* metal : migrate ops to separate functions

ggml-ci

* metal : add ggml_metal_library_t

ggml-ci

* metal : improve naming

ggml-ci

* metal : cleanup

ggml-ci

* metal : add support for GGML_OP_LOG

ggml-ci

* metal : fix error handling

ggml-ci
2025-09-17 20:38:12 +03:00
Aleksander Grygier
a7a98e0fff SvelteKit-based WebUI (#14839) 2025-09-17 19:29:13 +02:00
Xuan-Son Nguyen
8f8f2274ee convert : add Llama4ForCausalLM (#16042)
* convert : add Llama4ForCausalLM

* handle swa

* half working version

* fix use_kq_norm

* fix use_kq_norm
2025-09-17 19:18:21 +02:00
Johannes Gäßler
c959b676be CUDA: fix FA occupancy, optimize tile kernel (#15982) 2025-09-17 15:32:42 +02:00
David Ribeiro Alves
cd08fc3ecc common : Fix corrupted memory error on json grammar initialization (#16038)
Initializing RESERVED_NAME in is_reserved_name() is not thread-safe
and leads to corrupted memory when used from multiple threads,
as can be seen in the ASan trace below. This fixes the initialization
to make it thread-safe.

    #0 0x000100abd018 in std::__1::pair<std::__1::__hash_iterator<std::__1::__hash_node<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, void*>*>, bool> std::__1::__hash_table<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::hash<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::equal_to<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>>::__emplace_unique_key_args<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&) __hash_table:1565
    #1 0x000100ab0320 in SchemaConverter::visit(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&) json-schema-to-grammar.cpp:802
    #2 0x000100aafc48 in std::__1::__function::__func<build_grammar(std::__1::function<void (common_grammar_builder const&)> const&, common_grammar_options const&)::$_2, std::__1::allocator<build_grammar(std::__1::function<void (common_grammar_builder const&)> const&, common_grammar_options const&)::$_2>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> (std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&)>::operator()(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&) function.h:319
    #3 0x000100a2c938 in std::__1::__function::__func<common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool)::$_0::operator()(common_grammar_builder const&) const::'lambda'(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&), std::__1::allocator<common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool)::$_0::operator()(common_grammar_builder const&) const::'lambda'(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&)>, void (nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&)>::operator()(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&) function.h:319
    #4 0x000100a139f8 in foreach_function(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&, std::__1::function<void (nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&)> const&) chat.cpp:762
    #5 0x000100a2a7f4 in std::__1::__function::__func<common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool)::$_0, std::__1::allocator<common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool)::$_0>, void (common_grammar_builder const&)>::operator()(common_grammar_builder const&) function.h:319
    #6 0x000100aa98f4 in build_grammar(std::__1::function<void (common_grammar_builder const&)> const&, common_grammar_options const&) json-schema-to-grammar.cpp:982
    #7 0x0001009c9314 in common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool) chat.cpp:1110
    #8 0x0001009b8afc in common_chat_templates_apply_jinja(common_chat_templates const*, common_chat_templates_inputs const&) chat.cpp:1992
    #9 0x0001009b533c in common_chat_templates_apply(common_chat_templates const*, common_chat_templates_inputs const&) chat.cpp:2074
    #10 0x000100810120 in llamacpp_apply_chat_template+0x724 (predict_oai-98384e17fb94e863:arm64+0x100090120)
    ...

==45482==Register values:
 x[0] = 0x00006020004147f8   x[1] = 0x00006080000013c8   x[2] = 0x0000000000000000   x[3] = 0x0000604006289738
 x[4] = 0x0000000000000002   x[5] = 0x0000000000000001   x[6] = 0x04034000004b4000   x[7] = 0x0000000000000001
 x[8] = 0xbebebebebebebebe   x[9] = 0x17d7d7d7d7d7d7d7  x[10] = 0x00000c04000828ff  x[11] = 0x0000000000000001
x[12] = 0x000000002018d383  x[13] = 0x0000000000000000  x[14] = 0xfa0000000000fafa  x[15] = 0x000010700001ffff
x[16] = 0x000000019dc012c0  x[17] = 0x00000001021284f8  x[18] = 0x0000000000000000  x[19] = 0x00000001700acdc0
x[20] = 0x0000000000000002  x[21] = 0x000000002018d384  x[22] = 0x16dd16fd2e731151  x[23] = 0x0000007000020000
x[24] = 0x0000000100c69c08  x[25] = 0x0000000100c69c20  x[26] = 0x00006080000013c7  x[27] = 0x0000000100c69c00
x[28] = 0x00000001700acd60     fp = 0x00000001700aceb0     lr = 0x0000000100abce30     sp = 0x00000001700acd60
AddressSanitizer can not provide additional info.
SUMMARY: AddressSanitizer: SEGV __hash_table:1565 in std::__1::pair<std::__1::__hash_iterator<std::__1::__hash_node<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, void*>*>, bool> std::__1::__hash_table<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::hash<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::equal_to<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>>::__emplace_unique_key_args<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&)
Thread T5 created by T0 here:
    #0 0x0001020b99d4 in pthread_create+0x5c (libclang_rt.asan_osx_dynamic.dylib:arm64e+0x359d4)
    #1 0x000100873910 in std::sys::pal::unix::thread::Thread::new::h77254fdd87a28e05+0x118 (predict_oai-98384e17fb94e863:arm64+0x1000f3910)
    #2 0x0001007c7a1c in test::run_test::haeb3c2bcd5ed6cf6+0x76c (predict_oai-98384e17fb94e863:arm64+0x100047a1c)
    #3 0x0001007aedb0 in test::console::run_tests_console::he9d142d704f3a986+0x149c (predict_oai-98384e17fb94e863:arm64+0x10002edb0)
    #4 0x0001007c5758 in test::test_main::hf86a5e20735245b9+0x118 (predict_oai-98384e17fb94e863:arm64+0x100045758)
    #5 0x0001007c5da0 in test::test_main_static::h61ee9c8fd30abca0+0x54 (predict_oai-98384e17fb94e863:arm64+0x100045da0)
    ...

==45482==ABORTING
2025-09-17 11:08:02 +03:00
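
A sketch of the general fix pattern rather than the exact code in json-schema-to-grammar.cpp: make the reserved-name set a function-local static, whose initialization is guaranteed to be thread-safe since C++11 (the set name and entries below are placeholders).

```cpp
#include <string>
#include <unordered_set>

// Thread-safe: the local static is initialized exactly once, even if multiple
// threads call is_reserved_name() concurrently (guaranteed since C++11).
static bool is_reserved_name(const std::string & name) {
    static const std::unordered_set<std::string> RESERVED_NAMES = {
        "root", "space",   // placeholder entries, not the real list
    };
    return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
}
```
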
Eve
cb5bb6cc05 vulkan: automatically remove unsupported devices (#15976)
* remove unsupported vulkan devices

* make this happen during selection instead

* pass by reference
2025-09-17 09:35:37 +02:00
Daniel Bevenius
a91d035b90 ci : revert back to macos-13 for macOS-latest-cmake-x64 (#16040)
This commit reverts the runs-on parameter for the
macOS-latest-cmake-x64 job back to macos-13, undoing the change made in
commit 51abc96bdc ("ci : update
macos-latest* jobs to use macos-latest (#15938)").

The motivation for this is that using macos-latest will cause an ARM
based runner to be used, and not an x64 based runner.

Refs: https://github.com/ggml-org/llama.cpp/pull/15938#issuecomment-3300805127
2025-09-17 09:34:09 +02:00
Jie Fu (傅杰)
745cbcf2fe llama-quant : fix the verification of attention layers for encoder-decoder models (#16023)
Signed-off-by: Jie Fu <jiefu@tencent.com>
2025-09-17 09:30:55 +02:00
Jie Fu (傅杰)
1cbd80f8cf examples : support encoder-decoder models in the simple example (#16002)
Signed-off-by: Jie Fu <jiefu@tencent.com>
2025-09-17 10:29:00 +03:00
Shane A
85286f3548 model : add OLMo3 support (#16015)
* Add HF to gguf conversion logic for Olmo3

* Add Olmo3 implementation

* Update rope comment

* Fix indentation

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Apply suggestion from @CISC

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-09-17 09:01:58 +02:00
Chenguang Li
d5fabe3682 CANN: Optimize ggml_cann_set_device (#15935)
* CANN: Fix ggml_cann_set_device to avoid redundant device switches

- Added a check to skip aclrtSetDevice if the current device is already set.
- Prevents unnecessary context switches while keeping thread/device consistency.

* CANN: add device default id
2025-09-17 14:33:08 +08:00
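
A sketch of the redundant-switch guard described above; `acl_set_device` and `g_current_device` are illustrative names, not the actual CANN backend symbols.

```cpp
#include <cstdint>

// Illustrative: the real code calls aclrtSetDevice(); here it is stubbed out.
static void acl_set_device_raw(int32_t device) { /* aclrtSetDevice(device) */ (void) device; }

static int32_t g_current_device = -1;  // -1 == no device selected yet

// Skip the (relatively expensive) device switch when the requested device is
// already active, keeping thread/device consistency without redundant calls.
static void acl_set_device(int32_t device) {
    if (device == g_current_device) {
        return;
    }
    acl_set_device_raw(device);
    g_current_device = device;
}
```
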
jacekpoplawski
8ff206097c llama-bench: add --n-cpu-moe support (#15952)
* llama-bench: add --n-cpu-moe support

Support --n-cpu-moe in llama-bench the same way it is supported by
llama-server.
2025-09-16 16:17:08 +02:00
Daniel Bevenius
77475530b8 ci : use macos-latest for arm64 webgpu build (#16029)
This commit updates the runs-on field for the macOS arm64 webgpu build
job to use macos-latest instead of just latest.

The motivation for this is that this job can wait for a runner to pick
up the job for a very long time, sometimes over 7 hours. This is an
attempt to see if this change can help reduce the wait time.

Refs: https://github.com/ggml-org/llama.cpp/actions/runs/17754163447/job/50454257570?pr=16004
2025-09-16 15:27:52 +02:00
Daniel Bevenius
3913f8730e ggml : fix padding in timestep embedding kernels (#15932)
* ggml : remove adding extra dim timestep embedding

This commit updates the ggml_timestep_embedding function to no longer
add an extra dimension when the specified dimension is odd.

The motivation for this change is that the extra dimension is unnecessary
when the specified dimension is odd; it caused an issue in the kernels,
which were not expecting this extra dimension, and resulted in
uninitialized memory for the second-to-last dimension.

* ggml-cuda : fix padding in timestep embedding kernel

This commit removes the zeroing out of the last dimension now that we
are not adding the extra padding dimension.

* ggml-metal : fix padding in timestep embedding kernel

This commit fixes the zero padding for odd dimensions in
the timestep embedding kernel

* ggml-opencl : fix padding in timestep embedding kernel

This commit fixes the zero padding for odd dimensions in
the timestep embedding kernel.

* ggml-sycl : fix padding in timestep embedding kernel

This commit fixes the zero padding for odd dimensions in
the timestep embedding kernel.

* ggml-vulkan : fix padding in timestep embedding kernel

This commit fixes the zero padding for odd dimensions in
the timestep embedding kernel.

* ggml-cpu : fix padding in timestep embedding function

This commit removes the zeroing out of the last dimension now that we
are not adding the extra padding dimension.
2025-09-16 15:25:57 +02:00
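
An illustrative CPU-side version of the behaviour after the fix, assuming the common cos/sin half-split layout rather than the exact ggml kernel: for an odd `dim`, the single leftover element is zeroed explicitly instead of relying on an extra padding dimension.

```cpp
#include <cmath>
#include <vector>

// Sketch of a sinusoidal timestep embedding; layout and max_period follow the
// usual diffusion-model convention, not necessarily ggml's exact kernel.
std::vector<float> timestep_embedding(float timestep, int dim, int max_period = 10000) {
    std::vector<float> out(dim);
    const int half = dim / 2;
    for (int i = 0; i < half; ++i) {
        const float freq = std::exp(-std::log((float) max_period) * (float) i / (float) half);
        out[i]        = std::cos(timestep * freq);
        out[i + half] = std::sin(timestep * freq);
    }
    if (dim % 2 == 1) {
        out[dim - 1] = 0.0f;  // explicit zero padding for the odd leftover element
    }
    return out;
}
```
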
Daniel Bevenius
76888d202e ci : upload xcframework artifact from ios-xcode-build job (#16010)
This commit updates the github workflows build.yml file to include steps
for uploading and downloading the xcframework artifact. The
macos-latest-swift job now depends on the ios-xcode-build job and
downloads the xcframework artifact produced by it.

The motivation for this change is that it takes a long time to build
the xcframework and we are currently doing this twice in the workflow.
With this change, we only build it once and reuse the artifact.
2025-09-16 13:41:38 +02:00
Bowen Han
f1fbffb5c0 fix: apply clang-format to CUDA macros (#16017)
clang-format previously broke long CUDA macros (e.g. __launch_bounds__) into
unreadable line breaks inside template declarations, such as:

  template<int D, int ncols, int nwarps, int VKQ_stride,
           typename KQ_acc_t, bool use_logit_softcap>
      __launch_bounds__(nwarps*ggml_cuda_get_physical_warp_size(), 1)

This change adjusts formatting rules so that CUDA macros remain consistent
and aligned with the surrounding template syntax.
2025-09-16 08:59:19 +02:00
Daniel Bevenius
51abc96bdc ci : update macos-latest* jobs to use macos-latest (#15938)
* ci : update macos-latest* jobs to use macos-latest

This commit updates the jobs that are named macos-latest* to use the
macos-latest label instead explicit versions.

The motivation for this is that there is currently a mixture of
versions in this workflow and there are jobs that are failing because
they require a newer version.

Refs: https://github.com/ggml-org/llama.cpp/actions/runs/17644792595/job/50140010907#step:5:1759

* ci : add xcodebuild -downloadPlatform iOS command
2025-09-16 05:57:16 +02:00
Yuri Khrustalev
07808ebb07 cmake : Do not install tools on iOS targets (#15903) 2025-09-16 09:54:44 +07:00
Aman Gupta
6d758839ff Add LLaDA-7b-MoE diffusion model (#16003) 2025-09-16 10:38:28 +08:00
Jake Karnes
3d4053f77f CUDA: fix im2col_3d to respect non-contiguous inputs (views) (#15956)
* fix im2col_3d to respect non-contiguous inputs (views)

The CUDA 3D im2col kernel computed source addresses assuming compact layout (products of dims), ignoring nb[] strides. 

This patch switches im2col_3d source indexing to use true strides derived from src1->nb[] (in elements), mirroring the approach used in the 2D CUDA im2col path. Destination indexing is unchanged.

* use ggml_element_size() for src strides

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2025-09-16 00:28:31 +02:00
Diego Devesa
dc381aa9a6 docker : enable rocWMMA in ROCm images, add gfx1151 (#15997) 2025-09-15 23:38:52 +02:00
Diego Devesa
10d197409b releases : switch to rocWMMA develop branch, add gfx1151 (#15992)
* releases : switch to rocWMMA develop branch, add gfx1151

* remove unused variable ROCM_VERSION
2025-09-15 23:38:42 +02:00
yael-works
b907255f4b SYCL: Add COUNT_EQUAL operator support (#15991)
* SYCL: Add COUNT_EQUAL operator support (rebased on master)

* SYCL: remove duplicate op_count_equal definition

* tests: remove test_count_equal_typed and use test_count_equal for all cases

* tests: keep only I32 case for COUNT_EQUAL as suggested

* tests: keep only I32 case for COUNT_EQUAL as requested
2025-09-15 18:51:35 +02:00
Nikolay Popov
28c39da7c6 llama-run: Fix model download on Windows (#15988)
* llama-run: Fix model download on Windows
 * fix SSL error (SSL peer certificate or SSH remote key was not OK)
 * fix program crash on std::filesystem::rename

* llama-run: create a separate method to utilize RAII

* llama-run: handle rename exception
2025-09-15 11:08:30 +01:00
Aman Gupta
106220562a CUDA: some micro-optimizations in mmf.cuh for mul_mat_id (#15926) 2025-09-15 17:35:11 +08:00
ddh0
a68f31edd7 fix KLD percentile output (#15999)
In `llama-perplexity`, when using `--kl-divergence`, the KL divergence statistics output mistakenly displays the 99th percentile twice. This change fixes that and correctly displays the 90th percentile as originally intended (presumably).
2025-09-15 09:54:57 +02:00
Sigbjørn Skjæret
b8e09f08b9 model : add grok-2 support (#15539)
* add grok-2 support

* type fix

* type fix

* type fix

* "fix" vocab for invalid sequences

* fix expert tensor mapping and spaces in vocab

* add chat template

* fix norm tensor mapping

* rename layer_out_norm to ffn_post_norm

* ensure ffn_post_norm is mapped

* fix experts merging

* remove erroneous FFN_GATE entry

* concatenate split tensors and add more metadata

* process all expert layers and try cat instead of hstack

* add support for community BPE vocab

* fix expert feed forward length and ffn_down concat

* commit this too

* add ffn_up/gate/down, unsure if sequence is right

* add ffn_gate/down/up to tensor names

* correct residual moe (still not working)

* mess--

* fix embedding scale being applied twice

* add built in chat template

* change beta fast for grok if default value

* remove spm vocab in favor of community bpe vocab

* change attention temp length metadata type to integer

* update attention temp length metadata

* remove comment

* replace M_SQRT2 with std::sqrt(2)

* add yarn metadata, move defaults to hparams
2025-09-14 23:00:59 +02:00
Sigbjørn Skjæret
6c019cb04e server : only attempt to enable thinking if using jinja (#15967) 2025-09-14 21:17:04 +02:00
Georgi Gerganov
9dcd200d57 metal : remove memory pools (#15966)
* metal : remove mem pool usage

ggml-ci

* metal : remove mem pool implementation

ggml-ci

* metal : take into account the actual allocated memory of the tensor

ggml-ci

* cont : use ggml_backend_buft_get_alloc_size

ggml-ci

* cont : improve, comments

ggml-ci

* cont : add functions for the extra tensor sizes

* metal : add comments

ggml-ci

* metal : implement .get_alloc_size for the rest of the buffer types

ggml-ci

* metal : remove ggml_metal_heap

ggml-ci
2025-09-14 22:02:32 +03:00
Adam
0fa154e350 rocm.Dockerfile: added gfx1200,gfx1201 architectures to support AMD Radeon RX 9000 series (#15994)
* rocm.Dockerfile: added gfx1200,gfx1201 architectures to support  AMD Radeon RX 9000 series

https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html#rdna-os
states that the Radeon RX 9000 series is supported from Ubuntu 24.04.2 onward, and the Dockerfile is using 24.04, which is ROCm 6.4.

This fixes the `ROCm error: invalid device function` I was getting when trying to use the ROCm container.
2025-09-14 20:43:54 +02:00
Ruben Ortlam
261e6a20ff Vulkan: Clean up mul_mm shader (#15987)
* vulkan: move mul_mm dequantization steps into a separate file and functions

* improve mul_mm vector load code

* fix debug mode issues and warnings
2025-09-14 16:56:28 +02:00
lcy
a0e13dcbe5 build: fix the build failures of Windows HIP release job (#15984)
* build: fix the cache keys for Windows HIP release job

Update the cache keys to include the HIP SDK version, preventing the
use of outdated ROCm installation caches.

* build: sync changes from release.yml to build.yml

- Update HIP SDK version to 25.Q3 and ROCm version to 6.4.2
- Update the cache keys to reflect the new versions

* build: remove Windows HIP release for gfx1151
since the current stable rocWMMA does not support gfx1151.
2025-09-14 07:20:35 -07:00
Georgi Gerganov
a14bd35014 metal : fix kernel requirements (#15983)
* metal : fix kernel requirements

ggml-ci

* cont : fix supports_op

* cont : fix supports_op for ARGMAX
2025-09-14 15:33:22 +03:00
Radoslav Gerganov
918b26f197 rpc : fix regression when --device is used (#15981)
Fix regression introduced with commit 50f4281a6
2025-09-14 12:28:18 +03:00
Diego Devesa
9ecb884346 releases : update ROCM, add gfx1200, gfx1201, gfx1151 (#15972)
* releases : update ROCM, add gfx1200, gfx1201, gfx1151

* releases : set target to 13.3 for macos-x64

* add hipblaslt.dll to release

* add hipblaslt/library to release
2025-09-14 02:21:59 -07:00
Radoslav Gerganov
d1c6f11f47 doc : update documentation for --tensor-split (#15980)
* doc : update documentation for --tensor-split

* Update tools/main/README.md

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update tools/main/README.md

Co-authored-by: Diego Devesa <slarengh@gmail.com>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Diego Devesa <slarengh@gmail.com>
2025-09-14 12:10:07 +03:00
Aaron Teo
6380d6a3e7 ggml-zdnn: rm user mapped buffers (#15965)
* ggml-zdnn: rm user mapped buffers

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: rm dead code

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: attempt to fix missing extra data buffer free

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

---------

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2025-09-14 13:37:03 +08:00
Jeff Bolz
aa0c461efe vulkan: fix failing dequant shaders (#15862)
* vulkan: fix failing dequant shaders

* add missing const
2025-09-13 17:29:43 +02:00
Jeff Bolz
b9c9c9f789 vulkan: initialize vulkan-hpp to allow using extension function pointers (#15705)
Use this to query register count for shader compiles on NVIDIA. Currently
this is only for performance debug, but it could eventually be used in some
heuristics like split_k.
2025-09-13 17:23:30 +02:00
Diego Devesa
50f4281a6f llama : allow using iGPUs with --device (#15951)
* llama : allow using iGPUs with --device

* mtmd : allow iGPU

* rpc-server : allow iGPU
2025-09-13 16:49:49 +02:00
Georgi Gerganov
55758b00ca metal : refactor kernel loading (#15964)
* metal : refactor bin kernels loading

ggml-ci

* metal : refactor rms kernel loading

ggml-ci

* ci : try to add memory leaks check

ggml-ci

* ci : try to enable memory leak detection for Mac

* cont : seems to be working
2025-09-13 16:24:22 +03:00
Georgi Gerganov
f161463a54 metal : allow ops to run concurrently (#15929)
* metal : run graphs ops concurrently

ggml-ci

* cont : add flags for debugging and disabling concurrency

ggml-ci

* cont : refactor and handle fusing

ggml-ci

* cont : simplify - no need to use GPU address

ggml-ci

* cont : prepare mem ranges for reuse + add ggml-metal-common.cpp

ggml-ci

* cont : avoid redundant keywords in cpp [no ci]

* metal : reorder graph for better concurrency

ggml-ci

* metal : fix race on mem pool buffers

ggml-ci

* cont : add env GGML_METAL_GRAPH_OPTIMIZE_DISABLE

ggml-ci

* cont : refactor, optimize, add comments

ggml-ci

* cont : refactor ggml-metal.m

ggml-ci

* minor : update logs [no ci]
2025-09-13 13:54:28 +03:00
Georgi Gerganov
84d7b2fca1 metal : fix memory leaks (#15962)
ggml-ci
2025-09-13 12:45:04 +03:00
Aaron Teo
40be51152d ggml-zdnn: fix #15414, activate FP16 and BF16 acceleration and incorrect zTensor free (#15839) 2025-09-13 02:39:52 +08:00
Eric Curtin
4bf5549269 Add docker protocol support for llama-server model loading (#15790)
To pull and run models via: llama-server -dr gemma3
Add some validators and sanitizers for Docker Model urls and metadata

Signed-off-by: Eric Curtin <eric.curtin@docker.com>
2025-09-12 16:31:50 +01:00
Haiyue Wang
f4e664f838 context : remove redundant explicit casting to the same type (#15948)
The function 'output_reserve' already returns 'uint32_t', so there is no
need for an explicit cast.
2025-09-12 18:16:32 +03:00
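
For illustration, the kind of redundant cast being removed (variable names made up); since `output_reserve` already returns `uint32_t`, casting the result to the same type is a no-op.

```cpp
#include <cstdint>

// Stub standing in for the output_reserve mentioned above, which returns uint32_t.
static uint32_t output_reserve(uint32_t n_outputs) { return n_outputs; }

void example(uint32_t n_tokens) {
    const uint32_t before = (uint32_t) output_reserve(n_tokens);  // redundant cast
    const uint32_t after  = output_reserve(n_tokens);             // equivalent, cleaner
    (void) before; (void) after;
}
```
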
Georgi Gerganov
f088b6a84f server : adjust prompt similarity thold + add logs (#15913)
ggml-ci
2025-09-12 17:02:55 +03:00
Ruben Ortlam
304ac5693d Vulkan iGPU device selection overhaul and PCI ID API support (#15947)
* vulkan: implement ggml igpu device type, implement pci id support

* fix compiler warning

* prevent printf overflow warning
2025-09-12 13:24:21 +02:00
Mathieu Baudier
6c88ad8fa7 vulkan: Make device memory check more portable (#15939) 2025-09-12 09:06:20 +02:00
Neo Zhang Jianyu
704d90c987 Revert "sycl: add usage of enqueue_functions extension (#14244)" (#15910)
* Revert "sycl: add usage of enqueue_functions extension (#14244)"

This reverts commit 8308f98c7f.

* fix missed revert code, format the code
2025-09-12 09:15:12 +08:00
Diego Devesa
360d6533db ggml-backend : add GGML_BACKEND_DEVICE_TYPE_IGPU device type (#15797)
* ggml-backend : add GGML_BACKEND_DEVICE_TYPE_IGPU device type

ggml-backend : add device id to device props

llama : only use iGPU devices if there are no GPU devices

llama : do not use multiple devices from different backends with the same device id
2025-09-11 22:47:38 +02:00
Johannes Gäßler
0e6ff0046f CUDA: larger SRAM reads for tile FA, AMD FP16 dot (#15927)
* CUDA: larger SRAM reads for tile FA, AMD FP16 dot

* fix logic for availability of v_dot2_f32_f16
2025-09-11 21:19:58 +02:00
ddh0
df082f5630 nitpick : correct MB to MiB (#15934)
MB was incorrectly used for 1024 x 1024 bytes instead of MiB
2025-09-11 19:12:34 +02:00
Daniel Bevenius
24a6734daf ggml-cpu : add check for ARM MATMUL_INT8/i8mm support (#15922)
This commit adds a check for GGML_MACHINE_SUPPORTS_i8mm when enabling
MATMUL_INT8 features, ensuring that i8mm intrinsics are only used when
the target hardware actually supports them.

The motivation for this is to fix ggml CI build failures where the
feature detection correctly identifies that i8mm is not supported,
adding the +noi8mm flag, but MATMUL_INT8 preprocessor definitions are
still enabled, causing the compiler to attempt to use vmmlaq_s32
intrinsics without i8mm support.

Refs: https://github.com/ggml-org/ggml/actions/runs/17525174120/job/49909199499
2025-09-11 14:39:12 +01:00
Charles Xu
2b3efea9a4 kleidiai: fix GGML_ASSERT(*cur_backend_id != -1) failed (#15614)
* kleidiai: fix GGML_ASSERT(*cur_backend_id != -1) failed

* removes the Whisper-specific check for GET_ROWS support
2025-09-11 12:45:40 +02:00
hipudding
c0389dba43 CANN: Disable acl_graph for prefill stage (#15933)
Since the prefill length is not fixed, graphs constructed for the
prefill stage cannot be reused. For this reason, ACL graph
execution is disabled by default during prefill.
2025-09-11 15:59:37 +08:00
Oliver Simons
00681dfc16 CUDA: Add fastdiv to k_bin_bcast*, giving 1-3% E2E performance (#15872)
* Add fastdiv and fastmodulo to k_bin_bcast kernel

* Address review comments

* `prod_` instead of `prod` suffix

* Add test case for `k_bin_bcast_unravel` in CUDA backend
2025-09-10 22:04:03 +02:00
Jie Fu (傅杰)
4f658855fa llama : support T5 models with unequal number of encoder-decoder layers (#15909)
* Extend the support of T5 models with different encoder-decoder layers

Signed-off-by: Jie Fu <jiefu@tencent.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/constants.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/gguf_writer.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-arch.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-arch.h

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-hparams.h

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Rename n_dec_layer --> dec_n_layer

Signed-off-by: Jie Fu <jiefu@tencent.com>

* Adapt to cases when dec_n_layer > n_layer

Signed-off-by: Jie Fu <jiefu@tencent.com>

---------

Signed-off-by: Jie Fu <jiefu@tencent.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-09-10 20:51:51 +02:00
Sigbjørn Skjæret
6ab397e12b graph : support non-contiguous Q in build_attn_mha (#15908)
* support non-contiguous Q in build_attn_mha

* Update src/llama-graph.cpp

ggml-ci

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-09-10 19:08:59 +02:00
Daniel Bevenius
9de447d94e ggml-cpu : fix padding in ggml_timestep_embedding (#15917)
This commit fixes the zero padding for odd dimensions in
ggml_compute_forward_timestep_embedding_f32.
The motivation for this is that currently if an odd dimension is used,
the padding check incorrectly uses the dimension value for indexing.
For example, with dim=15:

Elements 0-6 are set to cosine values
Elements 7-13 are set to sine values
Element 14 is left uninitialized (contains garbage)
Element 15 is correctly set to zero

This fix changes embed_data[dim] to embed_data[2 * half] so that
element 14 (the first unused element) is properly set to zero as well
as the last element.

Resolves: https://github.com/ggml-org/ggml/issues/1324
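
A minimal standalone sketch of the padding logic described above, assuming a plain row buffer of `dim` floats (names and layout are illustrative, not the actual ggml function):

```cpp
#include <cmath>

// One timestep-embedding row: first half cosines, second half sines, then
// zero-pad what is left. Padding from index 2 * half (rather than from dim)
// is what initializes the element the old indexing missed for odd dims.
static void timestep_embedding_row(float * embed_data, int dim, float timestep, int max_period) {
    const int half = dim / 2;
    for (int i = 0; i < half; ++i) {
        const float freq = std::exp(-std::log((float) max_period) * i / half);
        const float arg  = timestep * freq;
        embed_data[i]        = std::cos(arg);
        embed_data[i + half] = std::sin(arg);
    }
    for (int i = 2 * half; i < dim; ++i) {
        embed_data[i] = 0.0f; // only reached when dim is odd
    }
}
```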
2025-09-10 17:31:40 +02:00
Georgi Gerganov
0f0a3c2851 metal : make the backend async (#15906)
* metal : make the backend async

ggml-ci

* cont : add comments, extend op offload, clean up

ggml-ci

* metal : fix batch size for MUL_MAT_ID

* metal : remove deprecated ggml_backend_metal_buffer_from_ptr

* metal : create only metal buffers, no wrapping of host memory

ggml-ci

* metal : restore .alloc_buffer for buffer_from_ptr_type

ggml-ci

* metal : remove broken implementation of GGML_OP_SET

ggml-ci

* metal : clean-up loose ends, ready for tests

ggml-ci

* metal : support both private and shared buffers

ggml-ci

* metal : enable private buffers + add global device queue

* metal : disable host buffer to prevent races

ggml-ci

* metal : avoid extra copy during set_tensor

ggml-ci

* metal : use separate buffer types for shared and private Metal buffers

ggml-ci

* metal : simplify synchronization logic

ggml-ci

* metal : fix build

ggml-ci

* metal : do not implement cpy_tensor

ggml-ci

* metal : separate implementations for shared and private buffers

ggml-ci
2025-09-10 17:52:35 +03:00
Daniel Bevenius
33daece86b ci : add caching for ROCm installation in release workflow (#15924)
This commit applies the same caching to the release workflow which
currently exists for the main CI workflow that was introduced in Commit
ff02caf9ee ("ci : cache ROCm installation
in windows-latest-cmake-hip (#15887)").
2025-09-10 15:39:57 +02:00
Daniel Bevenius
e7b6d83b52 tests : filter out no-ops from coverage report (#15900)
* tests : filter out no-ops from coverage report

This commit is a follow-up commit for #15745 to address the feedback on
how no-op operations should be filtered out from the coverage report.

Regarding the feedback about the UNARY and GLU sub-operations not being
handled, I am not exactly sure what should be done. They are included in
the coverage; for example, ABS, ELU, EXP, GELU, GEGLU, GEGLU_ERF, etc.
are in the list of covered operations:
```console
$ ./build/bin/test-backend-ops --show-coverage
Operations covered by tests (89):
  ✓ ABS
  ✓ ACC
  ✓ ADD
  ✓ ADD1
  ✓ ADD_ID
  ✓ ARANGE
  ✓ ARGMAX
  ✓ ARGSORT
  ✓ CLAMP
  ✓ CONCAT
  ✓ CONV_2D
  ✓ CONV_2D_DW
  ✓ CONV_3D
  ✓ CONV_TRANSPOSE_1D
  ✓ CONV_TRANSPOSE_2D
  ✓ COS
  ✓ COUNT_EQUAL
  ✓ CPY
  ✓ CROSS_ENTROPY_LOSS
  ✓ CROSS_ENTROPY_LOSS_BACK
  ✓ DIAG_MASK_INF
  ✓ DIV
  ✓ DUP
  ✓ ELU
  ✓ EXP
  ✓ FLASH_ATTN_EXT
  ✓ GATED_LINEAR_ATTN
  ✓ GEGLU
  ✓ GEGLU_ERF
  ✓ GEGLU_QUICK
  ✓ GELU
  ✓ GELU_ERF
  ✓ GELU_QUICK
  ✓ GET_ROWS
  ✓ GET_ROWS_BACK
  ✓ GROUP_NORM
  ✓ HARDSIGMOID
  ✓ HARDSWISH
  ✓ IM2COL
  ✓ IM2COL_3D
  ✓ L2_NORM
  ✓ LEAKY_RELU
  ✓ LOG
  ✓ MEAN
  ✓ MUL
  ✓ MUL_MAT
  ✓ MUL_MAT_ID
  ✓ NEG
  ✓ NORM
  ✓ OPT_STEP_ADAMW
  ✓ OPT_STEP_SGD
  ✓ OUT_PROD
  ✓ PAD
  ✓ PAD_REFLECT_1D
  ✓ POOL_2D
  ✓ REGLU
  ✓ RELU
  ✓ REPEAT
  ✓ REPEAT_BACK
  ✓ RMS_NORM
  ✓ RMS_NORM_BACK
  ✓ ROLL
  ✓ ROPE
  ✓ ROPE_BACK
  ✓ RWKV_WKV6
  ✓ RWKV_WKV7
  ✓ SCALE
  ✓ SET
  ✓ SET_ROWS
  ✓ SGN
  ✓ SIGMOID
  ✓ SILU
  ✓ SILU_BACK
  ✓ SIN
  ✓ SOFT_MAX
  ✓ SOFT_MAX_BACK
  ✓ SQR
  ✓ SQRT
  ✓ SSM_CONV
  ✓ SSM_SCAN
  ✓ STEP
  ✓ SUB
  ✓ SUM
  ✓ SUM_ROWS
  ✓ SWIGLU
  ✓ SWIGLU_OAI
  ✓ TANH
  ✓ TIMESTEP_EMBEDDING
  ✓ UPSCALE

Operations without tests (14):
  ✗ ADD_REL_POS
  ✗ CUSTOM
  ✗ DIAG
  ✗ DIAG_MASK_ZERO
  ✗ FLASH_ATTN_BACK
  ✗ GET_REL_POS
  ✗ IM2COL_BACK
  ✗ MAP_CUSTOM1
  ✗ MAP_CUSTOM2
  ✗ MAP_CUSTOM3
  ✗ POOL_1D
  ✗ POOL_2D_BACK
  ✗ WIN_PART
  ✗ WIN_UNPART

Coverage Summary:
  Total operations: 103
  Tested operations: 89
  Untested operations: 14
  Coverage: 86.4%
```

Refs: https://github.com/ggml-org/llama.cpp/pull/15745

* use of ggml_op enum values instead of strcmp
2025-09-10 14:17:09 +02:00
j-k
2cfef4d117 media : add transparent icon svg and png [no ci] (#15891) 2025-09-10 14:51:28 +03:00
Jesse
09e72a037c gitignore : Ignore vim swap files in tests (#15901) 2025-09-10 14:28:47 +03:00
Chenguang Li
10d8b2b6b0 CANN: Add ROPE sin/cos cache for reuse (#15912)
* CANN: Add ROPE sin/cos cache for reuse

Introduce sin/cos caching mechanism in ROPE to avoid redundant
computation across layers. The cache is built on the first layer
per device and reused by subsequent layers if parameters match.

- Added sin_cache / cos_cache pointers and position_length tracking
- Introduced cache validity flags and properties:
  (ext_factor, theta_scale, freq_scale, attn_factor, is_neox)
- Accelerates ROPE by eliminating repeated sin/cos generation

This change reduces overhead in multi-layer scenarios while
preserving correctness by verifying parameter consistency.
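
A rough CPU-side sketch of the caching idea with hypothetical names (the real CANN code manages device buffers and additional properties such as ext_factor and is_neox):

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

// Cache the per-position sin/cos tables and rebuild them only when one of
// the parameters that determine their contents changes.
struct rope_sin_cos_cache {
    std::vector<float> sin_vals, cos_vals;
    int64_t n_pos       = -1;
    int64_t n_dims      = -1;
    float   theta_scale = 0.0f;
    float   freq_scale  = 0.0f;
    float   attn_factor = 0.0f;

    void ensure(int64_t n_pos_, int64_t n_dims_, float theta_scale_, float freq_scale_, float attn_factor_) {
        const bool hit = n_pos == n_pos_ && n_dims == n_dims_ && theta_scale == theta_scale_ &&
                         freq_scale == freq_scale_ && attn_factor == attn_factor_;
        if (hit) {
            return; // reuse tables built by an earlier layer
        }
        n_pos = n_pos_; n_dims = n_dims_;
        theta_scale = theta_scale_; freq_scale = freq_scale_; attn_factor = attn_factor_;

        sin_vals.assign(n_pos * (n_dims / 2), 0.0f);
        cos_vals.assign(n_pos * (n_dims / 2), 0.0f);
        for (int64_t p = 0; p < n_pos; ++p) {
            float theta = (float) p * freq_scale;
            for (int64_t i = 0; i < n_dims / 2; ++i) {
                sin_vals[p * (n_dims / 2) + i] = std::sin(theta) * attn_factor;
                cos_vals[p * (n_dims / 2) + i] = std::cos(theta) * attn_factor;
                theta *= theta_scale; // step to the next rotary frequency
            }
        }
    }
};
```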

Co-authored-by: hipudding <huafengchun@gmail.com>

* fix typo

Signed-off-by: noemotiovon <757486878@qq.com>

---------

Signed-off-by: noemotiovon <757486878@qq.com>
Co-authored-by: hipudding <huafengchun@gmail.com>
2025-09-10 18:42:00 +08:00
Chenguang Li
28b5f190ef CANN: implement LRU cache for ACL graphs (#15814)
* CANN: implement LRU cache for ACL graphs in CANN backend

- Introduce ggml_cann_graph_lru_cache to store multiple ggml_cann_graph objects.
- Graphs are loaded on demand and evicted using LRU policy when capacity is exceeded.
- Updated push, move_to_front, and clear methods to manage cached graphs efficiently.
- Ensures reuse of graphs, reducing graph reconstruction overhead in CANN backend.
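
A simplified sketch of the LRU policy in generic C++ (hypothetical names; the actual cache stores ggml_cann_graph objects and matches them against graph properties):

```cpp
#include <cstddef>
#include <list>
#include <memory>

// Keep at most `capacity` graphs; the front of the list is most recently
// used, the back is evicted first.
template <typename Graph>
struct graph_lru_cache {
    size_t capacity;
    std::list<std::unique_ptr<Graph>> graphs;

    explicit graph_lru_cache(size_t capacity_) : capacity(capacity_) {}

    // insert a newly built graph as the most recently used entry
    void push(std::unique_ptr<Graph> g) {
        graphs.push_front(std::move(g));
        if (graphs.size() > capacity) {
            graphs.pop_back(); // evict the least recently used graph
        }
    }

    // mark an existing entry as most recently used
    void move_to_front(typename std::list<std::unique_ptr<Graph>>::iterator it) {
        graphs.splice(graphs.begin(), graphs, it);
    }

    void clear() {
        graphs.clear();
    }
};
```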

* fix typo

* The LRU cache capacity can be configured via an env variable

Signed-off-by: noemotiovon <757486878@qq.com>

* refactor acl graph

* refactor && fix review comments

Signed-off-by: noemotiovon <757486878@qq.com>

---------

Signed-off-by: noemotiovon <757486878@qq.com>
2025-09-10 15:29:12 +08:00
Daniel Bevenius
86587da03b llama : check returned fn ptrs from ggml_backend_reg_get_proc_address (#15893)
This commit adds check for two function pointers returned from
ggml_backend_reg_get_proc_address.

The motivation for this is that the function pointer could be nullptr if
the get proc address function changes in the future. This is also
consistent with all the other calls to ggml_backend_reg_get_proc_address
in the code base.
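
A small sketch of the pattern, assuming a made-up optional symbol name (only ggml_backend_reg_get_proc_address itself is from the real API):

```cpp
#include "ggml-backend.h"

// hypothetical optional feature exposed by a backend registry entry
typedef void (* optional_feature_t)(void);

static void try_optional_feature(ggml_backend_reg_t reg) {
    auto * fn = (optional_feature_t) ggml_backend_reg_get_proc_address(reg, "optional_feature");
    if (fn == nullptr) {
        return; // backend does not provide this symbol - skip instead of crashing
    }
    fn();
}
```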
2025-09-10 05:33:58 +02:00
Daniel Bevenius
ff02caf9ee ci : cache ROCm installation in windows-latest-cmake-hip (#15887)
This commit adds caching of the ROCm installation for the windows-latest-cmake-hip job. 

The motivation for this is that the installation can sometimes hang and/or not complete properly, leaving an invalid installation which later fails the build. By caching the installation, we can hopefully keep a good installation available in the cache and avoid the installation step.

Refs: https://github.com/ggml-org/llama.cpp/pull/15365
2025-09-10 05:23:19 +02:00
Ruben Ortlam
ae355f6f71 vulkan: throw the oom error instead of no memory type found (#15905) 2025-09-09 22:26:03 +02:00
Jeff Bolz
4f63cd705c vulkan: Fix OOB accesses in soft_max_back (#15861) 2025-09-09 14:41:15 +02:00
Johannes Gäßler
17bc5a815f HIP: use v_dot2_f32_f16 instruction for FA (#15884) 2025-09-09 14:04:43 +02:00
lksj92hs
ed54e32558 Workaround for subgroup arithmetic failing on MoltenVK with AMD GPUs (issue 15846) (#15886) 2025-09-09 14:01:15 +02:00
Aman Gupta
a972faebed CUDA: Add mul_mat_id support for the mmf kernel (#15767)
* CUDA: Add mul_mat_id support for the mmf kernel

Add support for mul_mat_id for bs < 16

* Review: use warp_size, fix should_use_mmf condition

* Launch one block per expert, stride along n_expert_used

* templatize mul_mat_id

* Pad shmem to 16 bytes, add helper function mul_mat_f_switch_ids

* Reduce compile times by dividing mmf into f16, bf16 and f32 variants

* Divide mmf by ncols_dst

* Add missing files

* Fix MUSA/HIP builds
2025-09-09 14:38:02 +08:00
Johannes Gäßler
550cf726e1 CUDA: fix GET_ROWS for large tensors (#15882) 2025-09-09 08:11:01 +02:00
Georgi Gerganov
c252ce67c4 contrib : add notes about merging PRs (#15881)
* contrib : add notes about merging PRs

* Update CONTRIBUTING.md

Co-authored-by: Diego Devesa <slarengh@gmail.com>

* Update CONTRIBUTING.md

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Diego Devesa <slarengh@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2025-09-09 08:42:10 +03:00
Daniel Bevenius
70cd37dbbe requirements : update transformers/torch for Embedding Gemma (#15828)
* requirements : update transformers/torch for Embedding Gemma

This commit updates the requirements to support converting
Embedding Gemma 300m models.

The motivation for this change is that during development I had a local
copy of the transformers package which is what I used for converting
the models. This was a mistake on my part and I should have also updated
my transformers version to the official release.

I had checked the requirements/requirements-convert_legacy_llama.txt
file and noted that the version was >=4.45.1,<5.0.0 and came to the
conclusion that no update would be needed; this assumed that
Embedding Gemma would be in a transformers release at the time
Commit fb15d649ed ("llama : add support
for EmbeddingGemma 300m (#15798)) was merged. So anyone wanting to
convert themselves would be able to do so. However, Embedding Gemma is
a preview release and this commit updates the requirements to use this
preview release.

* resolve additional python dependencies

* fix pyright errors in tokenizer test and remove unused import
2025-09-09 06:06:52 +02:00
Piotr Wilkin (ilintar)
acc1b008cf model-conversion : add extra debugging support for model conversion (#15877)
* feat: Extra debugging support for model conversion - added BF16 support for llama-callback-eval and support for dumping intermediate steps in run-org-model.py
2025-09-09 06:05:55 +02:00
Aldehir Rojas
7057faf64b json : support enum values within allOf (#15830) 2025-09-08 16:14:32 -05:00
j-k
fe1c92cd7b media : add llama1 icon (#15878)
Add svg and png based off llama1-icon.svg
2025-09-08 21:57:01 +03:00
Jeff Bolz
e68aa10d8f vulkan: sort graph to allow more parallel execution (#15850)
* vulkan: sort graph to allow more parallel execution

Add a backend proc to allow the backend to modify the graph. The
vulkan implementation looks at which nodes depend on each other
and greedily reorders them to group together nodes that don't
depend on each other. It only reorders the nodes, doesn't change
the contents of any of them.

With #15489, this reduces the number of synchronizations needed.
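
A generic sketch of the greedy idea (not the Vulkan backend code): repeatedly emit every node whose inputs are already scheduled, so independent nodes end up adjacent and can run without a synchronization between them.

```cpp
#include <cstddef>
#include <unordered_set>
#include <vector>

// node `i` depends on the nodes listed in deps[i]; returns a new execution
// order where each "wave" contains only mutually independent nodes
static std::vector<int> reorder_for_concurrency(const std::vector<std::vector<int>> & deps) {
    const size_t n = deps.size();
    std::vector<int> order;
    std::unordered_set<int> done;

    while (order.size() < n) {
        std::vector<int> wave;
        for (int i = 0; i < (int) n; ++i) {
            if (done.count(i)) {
                continue;
            }
            bool ready = true;
            for (int d : deps[i]) {
                if (!done.count(d)) { ready = false; break; }
            }
            if (ready) {
                wave.push_back(i);
            }
        }
        if (wave.empty()) {
            break; // cycle or malformed graph - bail out
        }
        for (int i : wave) {
            done.insert(i);
            order.push_back(i);
        }
    }
    return order;
}
```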

* call optimize_graph per-split
2025-09-09 02:10:07 +08:00
Aman Gupta
0a16bf52e6 CUDA: generate_cu_files.py - add missing mxfp4 (#15880) 2025-09-09 01:23:46 +08:00
Jesse
88021565f0 chat : Deepseek V3.1 reasoning and tool calling support (OpenAI Style) (#15533)
* Add DeepSeek V3.1 thinking mode support

- Added COMMON_CHAT_FORMAT_DEEPSEEK_V3_1 enum value
- Created common_chat_params_init_deepseek_v3_1() function (currently uses R1 implementation)
- Created common_chat_parse_deepseek_v3_1() function that handles V3.1 thinking format:
  - Extracts reasoning content before '</think>' tag into reasoning_content
  - Extracts regular content after '</think>' tag into content
  - No opening '<think>' tag in V3.1 format
- Added detection logic for V3.1 templates based on pattern: 'message['prefix'] is defined and message['prefix'] and thinking'
- Added V3.1 case to parsing switch statement

This addresses the issue where V3.1 outputs reasoning content followed by '</think>' and then regular content without the opening '<think>' tag.

* Another attempt at V3.1 non-thinking

* Fix test, but it's not asserting anything.

* Ignore vim swap files in tests dir

* Update the test

* Try using try_find_literal instead of regex

* passing test

* Revert "Try using try_find_literal instead of regex"

This reverts commit c50d887ec2.

* Remove unnecessary change

* Remove comment

* Add code to handle non-thinking mode.

* Try to set message['prefix'] when thinking is enabled.

* This fixes reasoning, but breaks normal content. We need state in the
chat parser.

* DeepSeek V3.1 thinking is now the default. Disable with `--reasoning-budget 0`.

* Simplify (DeepSeek V3.1 reasoning)

* Fix sign inversion bug

* Add some tool calling code (not working).

* Tool calls working in non-reasoning mode.

* Attempt a unit test for tool call parsing.

* Passing test

* Add tests for both happy path and broken fenced DeepSeek V3.1 tool call variants.

* Passing DeepSeek V3.1 tool call tests, but model is not working.

* Revert assistant response prefill change. Not my monkeys.

* Add fenced_thinking unit test variant. Passes, but thinking tool calling
still isn't working for some reason.

* Tests pass in reasoning mode. Also e2e tool test passes.

* Make a copy of the parse_json_tool_calls function for deepseek-v3.1 so
as to not accidentally introduce regressions.

* Fix thinking_forced_open logic. tool calling broken. Need to add another
test case.

* That's what I get for cargo culting a newline.

* Add multi tool call test for deepseek v3.1 non-reasoning

* Move test, remove .gitignore change

* Place deepseek-v3.1 reasoning test directly into existing reasoning
function per CISC's request.

* Address whitespace CI failure.

* Merge two assert_equals per CISC's request.

* Add DeepSeek-V3.1 tests to tests/test-chat.cpp per CISC's request.

* Merge deepseek V3.1 and regular parse_json_tool_calls() function
behaviors by adding optional update_cursor argument.

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* DeepSeek V3.1 fix reasoning_format none

* Strip grammar down to strictly what we expect based on model card. Throw
out parts we cargo culted from R1 that don't make sense.

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* DeepSeek V3.1 - Add edge case where thinking is forced open, there is
tool calling in the reasoning content, but then the model just stops the
output without closing the </think> tag, so it's not a partial. In this
case, use the tool call in the reasoning content.

* DeepSeek V3.1 - simplify update_cursor

* Update common/chat.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update common/chat.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update common/chat.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Fix indent

---------

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-09-08 16:59:48 +02:00
Xuan-Son Nguyen
56920f5665 server : bring back timings_per_token (#15879) 2025-09-08 16:50:05 +02:00
Georgi Gerganov
b0d52998b9 cuda : fix supports_op condition for get_rows when number of blocks is too large (#15868)
* cuda : fix supports_op condition for get_rows when src1->ne2 > 1

ggml-ci

* ggml : add comment about ggml_get_rows

ggml-ci

* cuda : add FIXME [no ci]

* cuda : update support condition

ggml-ci
2025-09-08 13:56:51 +03:00
Georgi Gerganov
f28d4f4ac9 metal : refactor + optimize (#15857)
* metal : refactor

ggml-ci

* cont : refactor FA-vec kernel

* cont : print metal library load time

* minor : warn to debug + better kernel names

ggml-ci

* metal : optimize mul_mv q8_0

ggml-ci

* metal : simplify FA pipeline creation functions

ggml-ci

* metal : improve naming consistency

* metal : safer function constants offsets

ggml-ci

* metal : comments

ggml-ci
2025-09-08 13:34:56 +03:00
Xuan-Son Nguyen
9fcb29f22f ggml: allow casting between f32 and i32 (#15783)
* ggml: allow casting between f32 and i32

* fix cuda

* add vulkan

* fix CPU non-cont

* add non-cont test case

* add note

* extend test number range

* correct note

* add cont version for vulkan
2025-09-08 12:33:01 +02:00
Sigbjørn Skjæret
5ef22d281d CUDA: non-contiguous src0 not supported for PAD (#15869) 2025-09-08 12:55:44 +03:00
Daniel Bevenius
233d773d02 convert : force setting sliding_window from original config (#15867)
* convert : force setting sliding_window from original config

This commit modifies the set_gguf_parameters method for EmbeddingGemma
so that it reads the sliding_window parameter from the original model
config.json and uses that value.

The motivation for this change is that the Gemma3TextConfig
constructor adjusts the sliding_window value, which can lead to
inconsistencies when converting models, as we expect this value to
match the original model's configuration.

Refs: bb45d3631e/src/transformers/models/gemma3/configuration_gemma3.py (L230)

* fix flake8 error

* add link to huggingface PR
2025-09-08 09:44:34 +02:00
Georgi Gerganov
a885dcff11 batched-bench : fix llama_synchronize usage during prompt processing (#15835)
ggml-ci
2025-09-08 10:27:07 +03:00
Georgi Gerganov
663027fd54 context : fix n_outputs during reserve (#15858)
ggml-ci
2025-09-08 10:26:36 +03:00
Georgi Gerganov
cf0e3ba150 model : avoid ggml_cont_3d for fused QKV weights (#15662)
* model : avoid ggml_cont_3d for fused QKV weights

ggml-ci

* kv-cache : make cpy_k and cpy_v implementation more readable

ggml-ci

* cont : add comments

ggml-ci

* cont : minor fix [no ci]

* cont : one more fix

* cont : clarity

ggml-ci

* kv-cache : require contiguous heads of k_cur and v_cur

ggml-ci
2025-09-08 10:25:33 +03:00
Jeff Bolz
d413dca003 tests: large sizes for get_rows (#15687) 2025-09-07 23:23:41 -05:00
Chenguang Li
85ca66a746 CANN: Stream sync between devices for acl_graph (#15809)
* CANN: Switch to stream synchronization

Switch to stream synchronization because events are not effective.

Co-authored-by: hipudding <huafengchun@gmail.com>

* CANN: add Comments

---------

Co-authored-by: hipudding <huafengchun@gmail.com>
2025-09-08 10:03:29 +08:00
Jeff Bolz
3976dfbe00 vulkan: support im2col_3d (#15795) 2025-09-07 13:50:26 -05:00
Aaron Teo
d36e61c580 ggml-cpu: clean up s390x SIMD (#15855)
* ggml-cpu: clean up s390x simd

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
(cherry picked from commit 0da4b6aa07)
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-cpu: fix hsum data types

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

---------

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2025-09-08 02:18:28 +08:00
Jeff Bolz
c97b5e5854 vulkan: Support pad_ext (#15794) 2025-09-07 19:00:49 +02:00
Jeff Bolz
267e99867f vulkan: Use larger loads in scalar/coopmat1 matmul (#15729)
I think glslang will translate an access like x[i][1].z to
OpAccessChain ... x, i, 1, 2
OpLoad float16_t ...

rather than loading all of x[i] in a single OpLoad. Change the
code to explicitly load the vector/matrix.
2025-09-07 18:53:07 +02:00
Daniel Bevenius
3b15924d71 ggml WebGPU: remove userdata from request adapter callback (#15527)
* ggml WebGPU: remove userdata from request adapter callback

This commit removes the `userdata` parameter from the WebGPU request
adapter callback in `ggml-webgpu.cpp`. Instead, the lambda function
captures the `webgpu_context` directly.

The motivation for this change is to simplify the code and improve
readability.
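
Roughly, the change replaces C-style `void * userdata` plumbing with a capturing lambda; a generic illustration under that assumption (hypothetical API, not the actual WebGPU types):

```cpp
#include <iostream>

struct webgpu_ctx {
    bool adapter_ready = false;
};

// stand-in for an async API that accepts any callable
template <typename Callback>
void request_adapter(Callback on_done) {
    on_done(true);
}

int main() {
    webgpu_ctx ctx;
    // the lambda captures the context directly - no separate userdata pointer
    request_adapter([&ctx](bool ok) {
        ctx.adapter_ready = ok;
    });
    std::cout << "adapter ready: " << ctx.adapter_ready << "\n";
}
```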

* inline the callback lambda into the RequestAdapter call

This commit removes the callback lambda variable and inlines it directly
into the RequestAdapter call.
2025-09-07 11:19:45 +03:00
Johannes Gäßler
79bc429262 CUDA: faster tile FA (Pascal/AMD), headsize 256 (#15769) 2025-09-07 00:26:28 +02:00
Charles Xu
c4df49a42d kleidiai: generalize compute_forward_kv_cache to compute_forward_fp16 (#15817) 2025-09-06 22:08:43 +08:00
Xuan-Son Nguyen
3c3635d2f2 server : speed up tests (#15836)
* server : speed up tests

* clean up

* restore timeout_seconds in some places

* flake8

* explicit offline
2025-09-06 14:45:24 +02:00
Xuan-Son Nguyen
61bdfd5298 server : implement prompt processing progress report in stream mode (#15827)
* server : implement `return_progress`

* add timings.cache_n

* add progress.time_ms

* add test

* fix test for chat/completions

* readme: add docs on timings

* use ggml_time_us

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-09-06 13:35:04 +02:00
Johannes Gäßler
01806e7771 ggml-cpu: document use of "free" memory [no ci] (#15834) 2025-09-06 13:28:44 +02:00
Aaron Teo
186415d595 ggml-cpu: drop support for nnpa intrinsics (#15821) 2025-09-06 11:27:28 +08:00
Gabe Goodhart
fd621880f3 aLoRA Support (#15327)
* feat: Add python-side constants and conversion for adapter.lora.invocation_string

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Add c++ side constants for adapter.lora.invocation_string

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Parse invocation string for adapters from GGUF

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(python): Update conversion to alora_invocation_tokens

This is the preferred method in PEFT which is the source of ground truth

https://github.com/huggingface/peft/pull/2609/files#diff-13380145401d203d5935c5189dd09879f990b81aa63e8e3aaff8ce9110333f0e

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(cpp): Update to alora_invocation_tokens on c++ side

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Add C APIs to get alora invocation token array from lora

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Initial implementation of alora cache logic in server

This does not yet do the part that identifies the invocation tokens and only
applies the lora adapter afterwards, but it does seem to produce correct
results if the invocation tokens are at the beginning of the uncached input.

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Identify alora invocation sequences

This currently limits to a single enabled alora per slot. Multiple aloras
with different invocation sequences would be possible, but it would require
a more complex integration of the adapter toggling and is not really a well
studied case for alora since it's unclear if one alora can reuse cache from
previous prefill computed with a different alora.

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Only reuse cache for tokens before the alora invocation start

This is a bit of an edge case, but theoretically a user could try the same
query with the alora disabled (just using the base model), then retry with
the alora. The cached tokens from the first pass should be invalid.

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Handle un-cached tokens that come before the alora activation

The solution is to only fill up to the token before the invocation start in
the batch if there are any tokens to be prefilled between those pulled from
cache and the invocation start. When this is detected, the alora is
temporarily disabled with a scale of 0.0, then immediately re-enabled after
it has been initialized for the internal graph. Since the batch does not
complete the prompt tokens, the remaining prompt tokens are handled in the
next task, pulling all of the non-alora tokens from cache and proceeding
with prefill for the alora tokens.

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Use || instead of 'or'

Too much python 🤦

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Fix off-by-one for limiting cached tokens to before alora start

This was the cause of the inconsistent results from the dummy test script
with and without the turn that runs the prompt without the adapter before
running it with the adapter.

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Support backwards-compatibility for "invocation_string" in adapter_config.json

While this has been replaced in the PEFT PR in favor of
alora_invocation_tokens, the existing adapters in the ibm-granite org on HF
use "invocation_string," so this will enable backwards compatibility and
enable testing now (before PEFT PR changes have percolated everywhere).

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Remove duplicate logging

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* feat: Report alora_invocation_string and alora_invocation_tokens from /lora-adapters

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-09-05 17:32:39 -06:00
Sigbjørn Skjæret
4281c7b315 ci : exempt correct research label (#15825) 2025-09-06 01:21:15 +02:00
Gabe Goodhart
5fac79cbc7 Thinking model disabled assistant prefill (#15404)
* feat: Set enable_thinking IFF not disabled and supported

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Fix inverted logic condition for prefill error

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Always parse the enable_thinking kwarg to overwrite the default value

From what I can tell, this started as a Qwen3-specific keyword, but since
the use in `chat.cpp` translates inputs.enable_thinking to the right
thinking kwarg for the given model, this is now more of a standardized
kwarg, so it should always override the default value when sent as part of
the chat_template_kwargs field in the API.

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Don't limit template expansion check to jinja

With the use_jinja check, non-jinja models would enable thinking and always
fail assistant prefill

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Add the error text to json type errors in json_value

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Explicitly reject string values for "enable_thinking"

There are too many possible "truthy" / "falsy" strings and too many
ambiguous strings that don't have a clear truthy/falsy value, so the
simplest thing to do here is to reject the request. Ideally, this would be
a 422 (Unprocessable Entity), but right now it's coming back as a 500.
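
A small sketch of that policy, assuming nlohmann::json for parsing (field access and error handling here are illustrative, not the server's actual json_value helper):

```cpp
#include <nlohmann/json.hpp>
#include <stdexcept>

using json = nlohmann::json;

// Accept only true/false; anything else (including "true"/"false" strings)
// is rejected instead of being coerced to a guess.
static bool parse_enable_thinking(const json & kwargs, bool default_value) {
    if (!kwargs.contains("enable_thinking")) {
        return default_value;
    }
    const json & v = kwargs.at("enable_thinking");
    if (!v.is_boolean()) {
        throw std::invalid_argument("enable_thinking must be a boolean");
    }
    return v.get<bool>();
}
```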

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* refactor: Move logic for detecting template enable_thinking support to common

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Use raw pointer for common chat template function

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2025-09-05 14:31:24 -06:00
Eric Curtin
408ff524b4 Implement --log-colors with always/never/auto (#15792)
With auto by default

Signed-off-by: Eric Curtin <ericcurtin17@gmail.com>
2025-09-05 19:43:59 +01:00
Johannes Gäßler
5143fa895e CUDA: fastdiv, launch bounds for mmvq + q8_1 quant (#15802)
* CUDA: fastdiv, launch bounds for mmvq + q8_1 quant
2025-09-05 16:07:02 +02:00
Daniel Bevenius
3a550b5ca4 tests : add --list-ops and --show-coverage options (#15745)
This commit adds two new command-line options to the
test-backend-ops.cpp that allow users to list all available GGML
operations and to show test coverage of these operations.

The motivation for this is that it can be useful to quickly see which
operations are currently covered by tests and which are not. Also it
might be useful when using the `support` mode.
2025-09-05 13:49:21 +01:00
Erik Scholz
a81283820a gguf: gguf_writer refactor (#15691)
* gguf: split gguf writer into base and buf impl
* gguf: templated gguf write out
* gguf: file based writer (avoid writing everything to memory first!)
* examples(llama2c): fix log not being the same level and compiler nits
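
A hedged sketch of the base/buffer/file split, with illustrative names rather than the actual gguf API: one writer accumulates into memory, the other streams straight to a file so large models need not be serialized to RAM first.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

struct writer_base {
    virtual ~writer_base() = default;
    virtual void write(const void * data, size_t size) = 0;
};

// in-memory writer: collects everything into a byte buffer
struct writer_buf : writer_base {
    std::vector<uint8_t> buf;
    void write(const void * data, size_t size) override {
        const uint8_t * p = (const uint8_t *) data;
        buf.insert(buf.end(), p, p + size);
    }
};

// file-based writer: bytes go straight to disk
struct writer_file : writer_base {
    FILE * f;
    explicit writer_file(const char * path) : f(std::fopen(path, "wb")) {}
    ~writer_file() override { if (f) std::fclose(f); }
    void write(const void * data, size_t size) override {
        if (f) std::fwrite(data, 1, size, f);
    }
};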
2025-09-05 11:34:28 +02:00
Georgi Gerganov
c610b6c11b kv-cache : fix SWA checks + disable cacheless iSWA (#15811)
ggml-ci
2025-09-05 10:39:22 +03:00
Daniel Bevenius
5d6688de08 model-conversion : add --embeddings flag to modelcard.template [no ci] (#15801)
This commit updates the modelcard.template file used in the model
conversion scripts for embedding models to include the llama-server
--embeddings flag in the recommended command to run the model.

The motivation for this change was that when using the model-conversion
"tool" to upload the EmbeddingGemma models to Hugging Face this flag was
missing and the embedding endpoint was therefore not available when
copy&pasting the command.
2025-09-05 04:36:23 +02:00
ExtReMLapin
4fd1242bef chat : fixed crash when Hermes 2 <tool_call> had a newline before it (#15639)
Co-authored-by: CNE Pierre FICHEPOIL <pierre-1.fichepoil@gendarmerie.interieur.gouv.fr>
2025-09-05 01:24:08 +02:00
Piotr Wilkin (ilintar)
b2426e469e chat : nemotron thinking & toolcalling support (#15676)
* feat: nemotron thinking & toolcalling support

* Trailing whitespaces

* Corrected template for Nemotron

* Template and parser fixes

* Final template and grammar changes

* Whitespace

* Always do lazy grammar processing since </think> tag will always be there.

* Allow extra content after toolcall

* Whitespace

* New tests: thinking + tools, tools + content, thinking + tools + content (new!)

* Whitespace

* Remove cURL test script
2025-09-05 01:22:22 +02:00
Piotr Wilkin (ilintar)
9e2b1e83c6 scripts : add Jinja tester PySide6 simple app (#15756)
* feat: add Jinja tester PySide6 simple app

* Linter fixes

* Pylint fixes

* Whitespace

* Add commandline support; add formatter; add extensions

* Remove testing actions

* Silence flake8 warnings for commandline mode

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Fix trailing whitespace/newline logic

* Update scripts/jinja/jinja-tester.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update scripts/jinja/jinja-tester.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-09-05 01:05:12 +02:00
Daniel Bevenius
fb15d649ed llama : add support for EmbeddingGemma 300m (#15798)
This commit adds support for the EmbeddingGemma 300m. This model supports
sliding window attention (SWA) and a new swq_type is introduced to
support symmetric SWA masking.

This commit also extracts the code from the function
llama_is_masked_swa in llama-impl.h, so that the logic can be shared
by both llm_graph_input_attn_no_cache::set_input and
llama_kv_cache::set_input_kq_mask.
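
A hedged sketch of the masking idea: a standard causal sliding window only keeps keys within n_swa positions behind the query, while a symmetric window keeps keys within n_swa positions on either side. Function names, parameters, and the exact boundary condition are illustrative, not the llama.cpp internals.

```cpp
#include <cstdint>

// causal SWA: mask keys that are too far in the past
static bool swa_masked_causal(int32_t pos_q, int32_t pos_k, int32_t n_swa) {
    return pos_k < pos_q - n_swa;
}

// symmetric SWA: mask keys outside the window on either side of the query
static bool swa_masked_symmetric(int32_t pos_q, int32_t pos_k, int32_t n_swa) {
    const int32_t d = pos_q - pos_k;
    return d > n_swa || d < -n_swa;
}
```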

With this commit the EmbeddingGemma 300m model can be converted to
GGUF and used with llama.cpp.

Once the model has been uploaded to HuggingFace it can be used like
this:
```console
./build/bin/llama-cli -hf ggml-org/embeddinggemma-300m-GGUF:Q8_0
```
2025-09-04 18:10:29 +02:00
Gabe Goodhart
856ed0947f metal : Add template specialization for mul_mm_id w/ ne20 == 10 (#15799)
Branch: GGMLMetalNE20

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2025-09-04 18:53:22 +03:00
Daniel Bevenius
d1e2adba65 llama : set n_outputs to 1 to avoid 0 outputs mean-pooling (#15791)
* llama : set n_outputs to 1 to avoid 0 outputs mean-pooling

This commit modifies the llama_context constructor to set n_outputs to
1.

The motivation for this is that when using pooling, and specifically
mean pooling, for embeddings having n_outputs set to 0 can lead to the
following error:
```console
$ build/bin/llama-embedding -m models/nomic-embed-text-1.5-Q4_K_M.gguf \
   --pooling mean -p "Hello, how are you?"
...
llama_context:        CPU  output buffer size =     0.12 MiB
/home/danbev/work/ai/llama.cpp/ggml/src/ggml.c:3023: GGML_ASSERT(ggml_can_mul_mat(a, b)) failed
0x0000743c96d107e3 in __GI___wait4 (pid=292978, stat_loc=0x0, options=0, usage=0x0) at ../sysdeps/unix/sysv/linux/wait4.c:30
warning: 30	../sysdeps/unix/sysv/linux/wait4.c: No such file or directory
30	in ../sysdeps/unix/sysv/linux/wait4.c
196	        waitpid(child_pid, NULL, 0);
230	        ggml_print_backtrace();
3023	    GGML_ASSERT(ggml_can_mul_mat(a, b));
1823	                cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
18983	    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
1399	    auto * gf = model.build_graph(gparams);
292	            auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
2329	        auto * ctx = new llama_context(*model, params);
913	    llama_context * lctx = llama_init_from_model(model, cparams);
105	    common_init_result llama_init = common_init_from_params(params);
[Inferior 1 (process 292976) detached]
Aborted (core dumped)
```

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* add comment about not reserving graphs with zero outputs

* add assert in graph_reserve to ensure n_outputs >= 1

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-09-04 15:40:44 +02:00
Chenguang Li
c1c354e44c CANN: Refactor ND to NZ workspace to be per-device (#15763)
* CANN:Refactor ND to NZ workspace to be per-device in Ascend backend

- Replaced the previous single global ND→NZ workspace with a per-device
  cache using unordered_map keyed by device ID.
- Functions `release_nz_workspace`, `relloc_nz_workspace`, and
  `get_nz_workspace` now manage workspace independently for each device,
  preventing memory conflicts in multi-device / pipeline parallel scenarios.
- This change fixes potential precision issues caused by workspace
  overwrites when multiple devices perform ND→NZ conversions concurrently.
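
A simplified host-side sketch of the per-device idea, with illustrative types and names rather than the CANN backend API: one scratch buffer per device id, so conversions on different devices cannot overwrite each other.

```cpp
#include <cstdint>
#include <unordered_map>
#include <vector>

struct nz_workspace {
    std::vector<uint8_t> buf;

    void reserve(size_t size) {
        if (buf.size() < size) {
            buf.resize(size); // grow only, reuse otherwise
        }
    }
};

// one workspace per device id instead of a single global buffer
static std::unordered_map<int, nz_workspace> g_nz_workspaces;

static nz_workspace & get_nz_workspace(int device, size_t size) {
    nz_workspace & ws = g_nz_workspaces[device]; // created empty on first use
    ws.reserve(size);
    return ws;
}

static void release_nz_workspace(int device) {
    g_nz_workspaces.erase(device);
}
```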

Co-authored-by: hipudding <huafengchun@gmail.com>

* refactor

Signed-off-by: noemotiovon <757486878@qq.com>

* rename

Signed-off-by: noemotiovon <757486878@qq.com>

* fix review comments

Signed-off-by: noemotiovon <757486878@qq.com>

---------

Signed-off-by: noemotiovon <757486878@qq.com>
Co-authored-by: hipudding <huafengchun@gmail.com>
2025-09-04 20:20:14 +08:00
Xuan-Son Nguyen
a68d914426 server: add exceed_context_size_error type (#15780)
* server: add exceed_context_size_error type

* change error code to 400
2025-09-04 11:50:23 +02:00
Eric Curtin
badb80cadb Document the new max GPU layers default in help (#15771)
This is a key change, just letting users know.

Signed-off-by: Eric Curtin <ericcurtin17@gmail.com>
2025-09-04 10:49:44 +01:00
leejet
0a1b3982cd ggml: add ops for WAN video model (cuda && cpu) (#15669)
* add conv3d support

* add ggml_pad_ext for cpu & cuda backend

* cuda/cpu: add im2col_3d support

* cuda: make im2col a little faster

* fix cuda pad/scale/im2col3d

* make im2col_3d faster

* gguf: support loading tensors which n_dims > GGML_MAX_DIMS

* fix cuda get_rows

* avoid ggml_conv_3d conflict

* correct GGML_OP_COUNT assertion

* avoid build failure

* avoid build failure on MacOS

* cuda: remove unnecessary MIN define

* fix cpu im2col_3d

* adjust the code style

* cuda: use simpler loop in get_rows

* add test_im2col_3d to test-backend-ops

* test-backend-ops.cpp: remove trailing whitespace

* cpu: im2col_3d support non-contiguous src

Co-authored-by: Jeff Bolz <jbolz@nvidia.com>

* fix test_im2col_3d

* remove unused variables

* cuda: get_rows: dfloat2 -> float2

* add test_pad_ext to test-backend-ops.cpp

* add gguf_init_from_file_ext impl

* Revert "gguf: support loading tensors which n_dims > GGML_MAX_DIMS"

This reverts commit d8377a0a37.

* Revert "add gguf_init_from_file_ext impl"

This reverts commit d9f1d13208.

* update ggml_backend_vk_device_supports_op

* fix ggml_backend_vk_device_supports_op

* update other backend supports op for ggml_pad_ext

* metal/opencl/sycl/vulkan: fix GGML_OP_PAD check in supports_op

---------

Co-authored-by: Jeff Bolz <jbolz@nvidia.com>
2025-09-04 10:38:49 +02:00
hipudding
5421f63ab0 CANN: Fix precision issue on 310I DUO multi-devices (#15784) 2025-09-04 15:12:30 +08:00
rmatif
820bc98531 opencl: add hs=40 to FA (#15758) 2025-09-03 23:30:28 -07:00
Chenguang Li
239b60e898 CANN: fix acl_rstd allocation size in ggml_cann_rms_norm (#15760)
Fixes #15330

Adjust the allocation size of acl_rstd. The parameter `dims` is set to 3 according to the CANN documentation.

Co-authored-by: Yuchuan <yuchuan-cao@users.noreply.github.com>
2025-09-04 11:03:02 +08:00
Ruben Ortlam
dff7551bfd vulkan: fix mmv subgroup16 selection (#15775) 2025-09-03 21:55:10 +01:00
Jeff Bolz
0fce7a1248 vulkan: don't use std::string in load_shaders, to improve compile time (#15724)
* vulkan: don't use std::string in load_shaders, to improve compile time

* keep the string version for those calls that use it
2025-09-03 20:33:15 +02:00
Daniel Bevenius
8227695d7a vulkan : update ggml_vk_instance_validation_ext_available (#15666)
* vulkan : update ggml_vk_instance_validation_ext_available

This commit updates ggml_vk_instance_validation_ext_available() to
check for VK_EXT_validation_features instead of
VK_KHR_portability_enumeration.

Based on how the returned boolean is used later in the code (to enable
both the validation layer and the VK_EXT_validation_features extension),
it appears the function may have been intended to check for the
validation layer features extension.

* remove try/catch

This was a leftover from a previous iteration where I was explicitly
querying for a specific validation layer first, which would throw.

* update warning message about validation layers
2025-09-03 20:24:50 +02:00
Shin-myoung-serp
0014fb4add ggml vulkan: add hardsigmoid and hardswish operations (#15762) 2025-09-03 20:22:55 +02:00
Oliver Simons
661ae31c9c CUDA: Optimize rms_norm_f32 kernel and its fused variants, giving 1-6% perf E2E (#15715)
* Add fastdiv, use it in modulo and use modulo in rms_norm_f32

Fastdiv is much faster way to do integer division, which was identified
as bottleneck in rms_norm_f32
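
A host-side C++ sketch of the general magic-number ("fastdiv") technique: precompute a multiplier and shift per divisor, then replace each division with a multiply-high and shift. This mirrors the idea rather than the exact CUDA implementation, and assumes the numerator stays below 2^31.

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

struct fastdiv_consts {
    uint32_t mp; // magic multiplier
    uint32_t L;  // shift amount
};

static fastdiv_consts init_fastdiv(uint32_t d) {
    assert(d != 0);
    uint32_t L = 0;
    while (L < 32 && (uint64_t{1} << L) < d) {
        ++L;
    }
    const uint32_t mp = (uint32_t) (((uint64_t{1} << 32) * ((uint64_t{1} << L) - d)) / d + 1);
    return { mp, L };
}

static uint32_t fastdiv(uint32_t n, fastdiv_consts c) {
    // high 32 bits of n * mp, then add n and shift; valid here for n < 2^31
    const uint32_t hi = (uint32_t) (((uint64_t) n * c.mp) >> 32);
    return (hi + n) >> c.L;
}

int main() {
    const fastdiv_consts c = init_fastdiv(7);
    for (uint32_t n = 0; n < 100000; ++n) {
        assert(fastdiv(n, c) == n / 7);
    }
    std::printf("fastdiv sketch ok\n");
}
```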

* Support more `block_size` values in `rms_norm_f32`

This makes us more flexible in selecting the optimal threads w.r.t
parallelizing across a col vs. launch-overheads of threads and mio
throttles

* Update ggml/src/ggml-cuda/common.cuh

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Replace modulo with fastmodulo in `rms_norm_f32`

* Use `BinPackArguments=true` for formatting function calls

Will file a separate PR to adjust .clang-format file

* Update ggml/src/ggml-cuda/common.cuh

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Use uint3 for both `fastdiv` and `fastmodulo`

The compiler seems to reliably optimize away the unused .z component in
the fastdiv use-case, see https://godbolt.org/z/rx8KPrKr3

* More constrained type declarations

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Rename fastdiv and fastmodulo variables to shared variable name

As suggested by JohannesGaessler, this increases clarity of the intended
use

* Pack fastdiv/fastmodulo constants into uint2/uint3 objects

By packing constants to be used together into a struct, we are less
likely to make errors.

* Rename function parameter of fastmodulo

`modulo_consts` is more fitting/descriptive

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2025-09-03 19:59:16 +02:00
Daniel Bevenius
407c23786d model-conversion : fix pyright errors (#15770)
This commit addresses type errors reported by pyright in the model
conversion scripts.
2025-09-03 18:28:36 +02:00
Georgi Gerganov
cdedb70a99 sampling : optimize dist sampler (#15704)
ggml-ci
2025-09-03 18:16:26 +03:00
Daniel Bevenius
2c8dac72eb llama : fix incorrect model type for Gemma 270M (#15764)
This commit fixes the model type for the Gemma 270M model in
llama_model.cpp which should be LLM_TYPE_270M. I had incorrectly added this
previously as LLM_TYPE_537M.

The motivation for this is that it causes the model to not be identified
properly when using tools like llama-bench. For example:
```console
$ ./build/bin/llama-bench -m models/gemma-3-270m-Q8_0.gguf
| model                          |       size | ...
| ------------------------------ | ---------: | ...
| gemma3 ?B Q8_0                 | 271.81 MiB | ...
| gemma3 ?B Q8_0                 | 271.81 MiB | ...
```

With the changes in this commit the output will be:
```console
$ ./build/bin/llama-bench -m models/gemma-3-270m-Q8_0.gguf
| model                          |       size | ...
| ------------------------------ | ---------: | ...
| gemma3 270M Q8_0               | 271.81 MiB | ...
| gemma3 270M Q8_0               | 271.81 MiB | ...
```
2025-09-03 13:35:49 +02:00
Daniel Bevenius
40a751ea9a model-conversion : remove hardcoded /bin/bash shebangs [no ci] (#15765)
* model-conversion : remove hardcoded /bin/bash shebangs [no ci]

This commit updates the bash scripts to use env instead of using
hardcoded /bin/bash in the shebang line.

The motivation for this is that some systems may have bash installed
in a different location, and using /usr/bin/env bash ensures that
the script will use the first bash interpreter found in the user's
PATH, making the scripts more portable across different environments.

* model-conversion : rename script to .py [no ci]

This commit renames run-casual-gen-embeddings-org.sh to
run-casual-gen-embeddings-org.py to reflect its Python nature.
2025-09-03 12:50:47 +02:00
hipudding
5eae934883 CANN: Add RoPE contiguous check for 310I DUP device (#15735) 2025-09-03 16:46:01 +08:00
xctan
05c0380f2a ggml-cpu : optimize RVV kernels (#15720)
* ggml-cpu : optimize rvv ggml_vec_dot_f32

* ggml-cpu : optimize 128-bit rvv ggml_vec_dot_q4_K_q8_K

* ggml-cpu : fix riscv arch flags

* ggml-cpu : add more rvv ops

* ggml-cpu : optimize rvv ggml_vec_dot_q4_K_q8_K

* ggml-cpu : optimize rvv ggml_vec_dot_q6_K_q8_K

* ggml-cpu : minor rvv adjustments

* ggml-cpu : fix riscv include
2025-09-03 16:16:21 +08:00
Daniel Bevenius
8c3fdf44ec model-conversion : add missing curl script [no ci] (#15761)
This commit adds a curl script to the model-conversion examples
which is currently missing. This script is required for the running the
embedding server targets to test llama-server embeddings functionality.
2025-09-03 09:48:35 +02:00
hipudding
f6da8cb86a CANN: Mask unsupported TRANSPOSE_1D operator (#15733)
CANN currently does not support kernels larger than 255.
This change disables such cases.
2025-09-03 14:08:22 +08:00
Chenguang Li
8a2234ea0c CANN: Fix type float_t to float (#15736)
Signed-off-by: noemotiovon <757486878@qq.com>
2025-09-03 10:43:53 +08:00
SnA1lGo
3de008208b fix: resolve unsigned int initialization warning for n_dims/size in gguf.cpp (#15754) 2025-09-02 21:27:30 +02:00
Oliver Simons
69db8a52e6 chore: Update .clang-format to use BinPackArguments=true (#15744)
This seems to correspond with what we want to do, see
[here](https://github.com/ggml-org/llama.cpp/pull/15715#discussion_r2315613796)
and [clang-format docs](https://clang.llvm.org/docs/ClangFormatStyleOptions.html#binpackarguments)
2025-09-03 01:40:37 +08:00
Johannes Gäßler
c466abe158 llama: -fa 1/0/-1 aliases for -fa on/off/auto (#15746) 2025-09-02 18:17:26 +02:00
Ruben Ortlam
0a2a3841e8 vulkan: fix shaders gen when no integer dot is available (#15740) 2025-09-02 16:02:26 +02:00
hipudding
9961d244f2 CANN: Resolve soft_max precision issue (#15730)
Previously, the slope tensor was set to fp16 to improve efficiency.
While this worked correctly in FA, it caused precision issues in soft_max.
This change applies different data types for different operators
to balance both accuracy and performance.
2025-09-02 17:12:37 +08:00
Jeff Bolz
25f1045f07 vulkan: Fix macro parameter order for f32 matmul shaders (#15716) 2025-09-02 14:37:01 +08:00
rmatif
97669e4073 opencl: add attn sinks support for FA kernels (#15706) 2025-09-01 23:26:53 -07:00
Chenguang Li
2f853687b3 CANN: Support eager execution mode under ACL graph compilation (#15712)
* [CANN] Support eager execution mode under ACL graph compilation

Add support for running operators in eager mode while ACL graph
compilation is enabled. This allows bypassing graph execution
and directly submitting ops, which is useful for debugging and
reducing graph build overhead in certain scenarios.

Signed-off-by: noemotiovon <757486878@qq.com>

* fix typo

Signed-off-by: noemotiovon <757486878@qq.com>

* rename to acl_graph_mode

Signed-off-by: noemotiovon <757486878@qq.com>

---------

Signed-off-by: noemotiovon <757486878@qq.com>
2025-09-02 14:07:48 +08:00
hipudding
ef2af57ddf CANN: Support ext_factor in rope (#15710) 2025-09-02 14:05:23 +08:00
Johannes Gäßler
5d804a4938 ggml-backend: raise GGML_MAX_SPLIT_INPUTS (#15722) 2025-09-01 16:14:55 -07:00
Gilad S.
d4d8dbe383 vulkan: use memory budget extension to read memory usage (#15545)
* vulkan: use memory budget extension to read memory usage

* fix: formatting and names

* formatting

* fix: detect and cache memory budget extension availability on init

* fix: read `budgetprops.heapBudget` instead of `heap.size` when memory budget extension is available

* style: lints
2025-09-01 21:17:42 +02:00
Jeff Bolz
35a42edac8 vulkan: add missing clamps in new mul_mat_id paths (#15702)
This is a missing interaction between #15546 and #15652
2025-09-01 21:01:10 +02:00
Ruben Ortlam
fec7911f8f vulkan: disable large mmv subgroups on older Nvidia GPUs (#15717) 2025-09-01 20:58:35 +02:00
s-goto-11
078ce23ea7 ggml: SVE support for exponential functions (#15145)
* SVE support for exponential functions

Add const notation to variable pg

* Update ggml/src/ggml-cpu/vec.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Add const

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-09-01 20:13:49 +02:00
Prashant Vithule
a0c2b207c5 ggml: aarch64: Implement SVE F16 kernels for vector functions (#15115)
* Added sve implementation for vec_dot_fp16 Kernel

* removed white spaces

* Added comment

* removed white spaces

* changed GGML_F16x_VEC_FMA for code consistency

* Update vec.h

---------

Co-authored-by: vithulep <p.m.vithule1517@gmail.com>
2025-09-01 20:13:16 +02:00
Jie Fu (傅杰)
4b20d8b7e3 convert : remove redundant code (#15708)
Signed-off-by: Jie Fu <jiefu@tencent.com>
2025-09-01 23:53:31 +08:00
Ruben Ortlam
02c1813517 Vulkan: Add Integer Dot Product mul_mat_vec shader for legacy quants (#14903)
* vulkan: Add Integer Dot Product mul_mat_vec shader for legacy quants

* vulkan: use subgroup operations for quantize_q8_1 shader

* vulkan: add q8_1_x4 type with 128-bit alignment, use in mul_mat_vecq shader

* vulkan: use q8_1_x4 blocks in mul_mmq shader

* vulkan: do 8 calculations per invocation instead of 32 in mul_mat_vecq, similar to mul_mat_vec

* vulkan: tune mul_mat_vecq performance for Intel

* vulkan: fix quantizing issue when tensor is not divisible by 128

* vulkan: adapt integer dot mmv to mmv small m optimization (#15355)

* vulkan: allow all subgroup modes for mmv and mmvq

* vulkan: use prealloc intermediate reuse for mmvq path

* vulkan: tune mmvq for Intel, AMD GCN and Nvidia RTX 3090

* vulkan: adapt mmv quantize_y path to conditional sync logic

* vulkan: disable q8_0 mmvq on Nvidia

* vulkan: enable q8_0 on Nvidia pre-turing

* fix prealloc sync condition

* fix llvmpipe subgroup 8 issue
2025-09-01 16:19:07 +02:00
Daniel Bevenius
77dee9de97 ggml : WebGPU add TRANSPOSE and RESHAPE to supported ops (#15695)
* ggml : WebGPU add TRANSPOSE and RESHAPE to supported ops

This commit adds support for the TRANSPOSE and RESHAPE operations in the
ggml webgpu backend.

Co-authored-by: Diego Devesa <slarengh@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-09-01 14:28:49 +02:00
Jie Fu (傅杰)
4795c91c32 docs : add Hunyuan to models section (#15707)
Signed-off-by: Jie Fu <jiefu@tencent.com>
2025-09-01 10:34:59 +03:00
Akarshan Biswas
b66df9d9c9 CUDA: fix build error from ambiguous __half conversions in conv2d (#15690)
* CUDA: fix build error from ambiguous __half conversions in conv2d

Building conv2d with half precision failed because `__half` defines
multiple implicit conversion operators (to float, int, short, etc.),
causing ambiguous overload resolution when multiplying with float.

Introduce a templated `to_float` helper that explicitly converts
`__half` via `__half2float`, while passing through float unchanged.
Use this helper in conv2d accumulation to ensure unambiguous and
correct promotion to float.

Fixes some build errors with half-precision kernels on CUDA.

ggml-ci

* CUDA: Replace custom to_float helper with unified ggml_cuda_cast and add half‑>float conversion

* CUDA: Add missing convert.cuh header

* CUDA: remove unnecessary extension in ggml_cuda_cast

* CUDA: Address review comment, remove second type template argument
2025-09-01 06:55:06 +05:30
hipudding
b9382c3877 CANN: Optimize MUL_MAT_ID (#15658) 2025-09-01 08:57:23 +08:00
hipudding
3dc7397a27 CANN: fix RoPE cache issue on multi-device (#15629)
* CANN: fix RoPE cache issue on multi-device

RoPE cache only needs to be computed once per token.
However, in multi-device scenarios, not every device starts
computation from layer 0, which may lead to unallocated memory
issues and precision errors.

This commit records the first layer of each device to avoid
the above issues.

* CANN: Optimize first-layer detection method

* CANN: Remove trailing whitespace

* CANN: Only cache the data that can be determined as unchanged through the parameters.

* CANN: Update function comment
2025-09-01 08:57:00 +08:00
Georgi Gerganov
e92d53b29e sampling : optimize samplers by reusing bucket sort (#15665)
* sampling : optimize sorting using bucket sort in more places

ggml-ci

* sampling : do not sort in dist sampler

ggml-ci

* sampling : avoid heap allocations for sort buffers

ggml-ci

* common : add option to sort sampling candidates by probability

ggml-ci

* sampling : revert the change for preserving sort buffers

* sampling : use std::copy instead of memcpy

* sampling : clarify purpose of partial sort helpers

ggml-ci

* cont : remove wrong comment [no ci]

* common : update comment

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2025-08-31 20:41:02 +03:00
Georgi Gerganov
0d161f021a server : enable /slots by default and make it secure (#15630)
* server : enable /slots by default and make it secure

ggml-ci

* server : fix tests to pass `--no-slots` when necessary

* server : extend /props with info about enabled endpoints
2025-08-31 20:11:58 +03:00
Georgi Gerganov
4efd5a8316 metal : fix checks for available FA kernels (#15700)
* metal : fix checks for available FA kernels

ggml-ci

* cont : fix comment [no ci]
2025-08-31 19:43:30 +03:00
Diego Devesa
274966226f llama : fix fattn reserve call n_seqs parameter (#15699)
ggml-ci
2025-08-31 18:47:05 +03:00
Diego Devesa
9777032dcc llama : separate compute buffer reserve from fattn check (#15696)
Exposes ggml_backend_sched_split_graph() to allow splitting the graph without allocating compute buffers and uses it to split the graph for the automatic Flash Attention check.
2025-08-31 15:49:03 +02:00
Sigbjørn Skjæret
7d3c9f2b21 ci : explicitly set fa off or on (#15692) 2025-08-31 15:30:20 +02:00
Jeff Bolz
bbbf5ecccb vulkan: handle large sizes for get_rows (#15686) 2025-08-31 10:13:27 +02:00
Jeff Bolz
c37052ab4d vulkan: mul_mat_id coopmat2 optimizations (#15546)
* vulkan: mul_mat_id coopmat2 optimizations

Add a path for when the tile fits in BN/2, similar to what we have for mul_mat.

Only call fetch_scales/store_scales once per QUANT_K block, and once at the
beginning in case start_k is not aligned.

* Also add a path for BN/4 - worth a couple more percent
2025-08-31 09:06:43 +02:00
Daniel Bevenius
5c16b9c87d vulkan : remove unused portability_enumeration_ext variable (#15679)
This commit removes the portability_enumeration_ext variable from the
ggml_vk_instance_portability_enumeration_ext_available function as it
is initialized to false but never modified, making it redundant.
2025-08-31 08:46:42 +02:00
Jeff Bolz
b97c9edc59 vulkan: Allow fallback to sysmem memory when vidmem is full (#15649)
* vulkan: Allow fallback to sysmem memory when vidmem is full

* vulkan: Add env var GGML_VK_ALLOW_SYSMEM_FALLBACK
2025-08-31 08:30:54 +02:00
Jeff Bolz
94e82c7ead vulkan: clamp matmul and FA results to the max finite value (#15652)
* vulkan: clamp matmul and FA results to the max finite value

* only clamp for fp16
2025-08-31 08:27:57 +02:00
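The clamping itself (done in the Vulkan shaders in the commit above) amounts to saturating at the largest finite fp16 value instead of letting the result overflow to infinity; an illustrative scalar version:

```cpp
static inline float clamp_to_f16_finite(float x) {
    const float F16_MAX = 65504.0f;  // largest finite IEEE fp16 value
    if (x >  F16_MAX) return  F16_MAX;
    if (x < -F16_MAX) return -F16_MAX;
    return x;
}
```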
Charles Xu
4d74393bcc ggml: update kleidiai to v1.13.0 (#15663) 2025-08-31 00:03:42 +08:00
Diego Devesa
dd892555b0 Update build.md to remove MSVC arm64 notes (#15684)
Removed information about MSVC compiler limitations for arm64 builds.
2025-08-30 23:51:28 +08:00
Johannes Gäßler
e81b8e4b7f llama: use FA + max. GPU layers by default (#15434)
* llama: use max. GPU layers by default, auto -fa

* ggml-backend: abort instead of segfault
2025-08-30 16:32:10 +02:00
662 changed files with 63842 additions and 32942 deletions

View File

@@ -22,7 +22,14 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: false
# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
AttributeMacros:
- __host__
- __device__
- __global__
- __forceinline__
- __launch_bounds__
BinPackArguments: true
BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach

View File

@@ -17,6 +17,7 @@ Checks: >
clang-analyzer-*,
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
-performance-enum-size,
portability-*,
-portability-simd-intrinsics,
misc-*,

View File

@@ -4,7 +4,7 @@ ARG UBUNTU_VERSION=24.04
ARG ROCM_VERSION=6.4
ARG AMDGPU_VERSION=6.4
# Target the CUDA build image
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
### Build image
@@ -15,16 +15,13 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# This is mostly tied to rocBLAS supported archs.
# gfx803, gfx900, gfx1032, gfx1101, gfx1102, not officially supported
# gfx906 is deprecated
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html
ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
#ARG ROCM_DOCKER_ARCH=gfx1100
ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151'
#ARG ROCM_DOCKER_ARCH='gfx1151'
# Set nvcc architectured
# Set ROCm architectures
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
# ENV CC=/opt/rocm/llvm/bin/clang
# ENV CXX=/opt/rocm/llvm/bin/clang++
RUN apt-get update \
&& apt-get install -y \
@@ -39,8 +36,16 @@ WORKDIR /app
COPY . .
RUN git clone https://github.com/rocm/rocwmma --branch develop --depth 1
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
cmake -S . -B build \
-DGGML_HIP=ON \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DCMAKE_HIP_FLAGS="-I$(pwd)/rocwmma/library/include/" \
-DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \
-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \
-DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
&& cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib \

.devops/s390x.Dockerfile (new file, 122 lines)
View File

@@ -0,0 +1,122 @@
ARG GCC_VERSION=15.2.0
ARG UBUNTU_VERSION=24.04
### Build Llama.cpp stage
FROM --platform=linux/s390x gcc:${GCC_VERSION} AS build
RUN --mount=type=cache,target=/var/cache/apt \
--mount=type=cache,target=/var/lib/apt/lists \
apt update -y && \
apt upgrade -y && \
apt install -y --no-install-recommends \
git cmake ccache ninja-build \
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
libopenblas-dev libcurl4-openssl-dev && \
rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY . .
RUN --mount=type=cache,target=/root/.ccache \
--mount=type=cache,target=/app/build \
cmake -S . -B build -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DLLAMA_BUILD_TESTS=OFF \
-DGGML_BACKEND_DL=OFF \
-DGGML_NATIVE=OFF \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS && \
cmake --build build --config Release -j $(nproc) && \
cmake --install build --prefix /opt/llama.cpp
COPY *.py /opt/llama.cpp/bin
COPY .devops/tools.sh /opt/llama.cpp/bin
COPY gguf-py /opt/llama.cpp/gguf-py
COPY requirements.txt /opt/llama.cpp/gguf-py
COPY requirements /opt/llama.cpp/gguf-py/requirements
### Collect all llama.cpp binaries, libraries and distro libraries
FROM --platform=linux/s390x scratch AS collector
# Copy llama.cpp binaries and libraries
COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib
COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
### Base image
FROM --platform=linux/s390x ubuntu:${UBUNTU_VERSION} AS base
RUN --mount=type=cache,target=/var/cache/apt \
--mount=type=cache,target=/var/lib/apt/lists \
apt update -y && \
apt install -y --no-install-recommends \
# WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
curl libgomp1 libopenblas-dev && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete
# Copy llama.cpp libraries
COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu
### Full
FROM --platform=linux/s390x base AS full
ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /app
RUN --mount=type=cache,target=/var/cache/apt \
--mount=type=cache,target=/var/lib/apt/lists \
apt update -y && \
apt install -y \
git cmake libjpeg-dev \
python3 python3-pip python3-dev && \
apt autoremove -y && \
apt clean -y && \
rm -rf /tmp/* /var/tmp/* && \
find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
find /var/cache -type f -delete
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
COPY --from=collector /llama.cpp/bin /app
COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
RUN pip install --no-cache-dir --break-system-packages \
-r /app/gguf-py/requirements.txt
ENTRYPOINT [ "/app/tools.sh" ]
### CLI Only
FROM --platform=linux/s390x base AS light
WORKDIR /llama.cpp/bin
# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
### Server
FROM --platform=linux/s390x base AS server
ENV LLAMA_ARG_HOST=0.0.0.0
WORKDIR /llama.cpp/bin
# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
EXPOSE 8080
ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]

View File

@@ -52,3 +52,11 @@ insert_final_newline = unset
[vendor/miniaudio/miniaudio.h]
trim_trailing_whitespace = unset
insert_final_newline = unset
[tools/server/webui/**]
indent_style = unset
indent_size = unset
end_of_line = unset
charset = unset
trim_trailing_whitespace = unset
insert_final_newline = unset

View File

@@ -6,7 +6,7 @@ on:
jobs:
debian-13-riscv64-native: # Bianbu 2.2
runs-on: self-hosted
runs-on: [self-hosted, RISCV64]
steps:
- name: Install prerequisites

View File

@@ -56,7 +56,7 @@ env:
jobs:
macOS-latest-cmake-arm64:
runs-on: macos-14
runs-on: macos-latest
steps:
- name: Clone
@@ -88,6 +88,7 @@ jobs:
-DGGML_METAL_SHADER_DEBUG=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
- name: Test
id: cmake_test
@@ -126,7 +127,8 @@ jobs:
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
-DGGML_RPC=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -136,7 +138,7 @@ jobs:
ctest -L main --verbose --timeout 900
macOS-latest-cmake-arm64-webgpu:
runs-on: macos-14
runs-on: macos-latest
steps:
- name: Clone
@@ -709,6 +711,7 @@ jobs:
macOS-latest-swift:
runs-on: macos-latest
needs: ios-xcode-build
strategy:
matrix:
@@ -725,6 +728,12 @@ jobs:
key: macOS-latest-swift
evict-old-files: 1d
- name: Download xcframework artifact
uses: actions/download-artifact@v4
with:
name: llama-xcframework
path: build-apple/llama.xcframework/
- name: Dependencies
id: depends
continue-on-error: true
@@ -746,11 +755,6 @@ jobs:
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: xcodebuild for swift package
id: xcodebuild
run: |
./build-xcframework.sh
windows-msys2:
runs-on: windows-2025
@@ -1050,9 +1054,13 @@ jobs:
run: examples/sycl/win-build-sycl.bat
windows-latest-cmake-hip:
if: ${{ github.event.inputs.create_release != 'true' }}
runs-on: windows-2022
env:
# The ROCm version must correspond to the version used in the HIP SDK.
ROCM_VERSION: "6.4.2"
HIPSDK_INSTALLER_VERSION: "25.Q3"
steps:
- name: Clone
id: checkout
@@ -1061,23 +1069,46 @@ jobs:
- name: Clone rocWMMA repository
id: clone_rocwmma
run: |
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
git clone https://github.com/rocm/rocwmma --branch rocm-${{ env.ROCM_VERSION }} --depth 1
- name: Install
- name: Cache ROCm Installation
id: cache-rocm
uses: actions/cache@v4
with:
path: C:\Program Files\AMD\ROCm
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
- name: Install ROCm
if: steps.cache-rocm.outputs.cache-hit != 'true'
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
$proc.WaitForExit(600000)
$completed = $proc.WaitForExit(600000)
if (-not $completed) {
Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
$proc.Kill()
exit 1
}
if ($proc.ExitCode -ne 0) {
Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
exit 1
}
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
id: verify
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
# Find and test ROCm installation
$clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
if (-not $clangPath) {
Write-Error "ROCm installation not found"
exit 1
}
& $clangPath.FullName --version
- name: Install ccache
uses: ggml-org/ccache-action@v1.2.16
@@ -1141,8 +1172,17 @@ jobs:
run: |
./build-xcframework.sh
- name: Upload xcframework artifact
uses: actions/upload-artifact@v4
with:
name: llama-xcframework
path: build-apple/llama.xcframework/
retention-days: 1
- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
run: |
xcodebuild -downloadPlatform iOS
xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
android-build:
runs-on: ubuntu-latest
@@ -1207,3 +1247,167 @@ jobs:
-DGGML_CANN=on \
-DSOC_TYPE=${{ matrix.device }}
cmake --build build -j $(nproc)
# TODO: simplify the following workflows using a matrix
# TODO: run lighter CI on PRs and the full CI only on master (if needed)
ggml-ci-x64-cpu-low-perf:
runs-on: [self-hosted, Linux, X64, CPU, low-perf]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-arm64-cpu-low-perf:
runs-on: [self-hosted, Linux, ARM64, CPU, low-perf]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-x64-cpu-high-perf:
runs-on: [self-hosted, Linux, X64, CPU, high-perf]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-arm64-cpu-high-perf:
runs-on: [self-hosted, Linux, ARM64, CPU, high-perf]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-x64-nvidia-cuda:
runs-on: [self-hosted, Linux, X64, NVIDIA]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
nvidia-smi
GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-x64-nvidia-vulkan-cm:
runs-on: [self-hosted, Linux, X64, NVIDIA]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-x64-nvidia-vulkan-cm2:
runs-on: [self-hosted, Linux, X64, NVIDIA, COOPMAT2]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-x64-cpu-amx:
runs-on: [self-hosted, Linux, X64, CPU, AMX]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
# ggml-ci-x64-amd-vulkan:
# runs-on: [self-hosted, Linux, X64, AMD]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v4
#
# - name: Test
# id: ggml-ci
# run: |
# vulkaninfo --summary
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
#
# ggml-ci-x64-amd-rocm:
# runs-on: [self-hosted, Linux, X64, AMD]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v4
#
# - name: Test
# id: ggml-ci
# run: |
# amd-smi static
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-mac-metal:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp
ggml-ci-mac-vulkan:
runs-on: [self-hosted, macOS, ARM64]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp ~/mnt/llama.cpp

View File

@@ -17,7 +17,7 @@ jobs:
steps:
- uses: actions/stale@v5
with:
exempt-issue-labels: "refactoring,help wanted,good first issue,research,bug,roadmap"
exempt-issue-labels: "refactoring,help wanted,good first issue,research 🔬,bug,roadmap"
days-before-issue-stale: 30
days-before-issue-close: 14
stale-issue-label: "stale"

View File

@@ -44,6 +44,7 @@ jobs:
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false }
# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
steps:

View File

@@ -108,7 +108,8 @@ jobs:
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
-DGGML_RPC=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Determine tag name
@@ -528,11 +529,14 @@ jobs:
windows-hip:
runs-on: windows-2022
env:
HIPSDK_INSTALLER_VERSION: "25.Q3"
strategy:
matrix:
include:
- name: "radeon"
gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
steps:
- name: Clone
@@ -542,29 +546,52 @@ jobs:
- name: Clone rocWMMA repository
id: clone_rocwmma
run: |
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
git clone https://github.com/rocm/rocwmma --branch develop --depth 1
- name: Cache ROCm Installation
id: cache-rocm
uses: actions/cache@v4
with:
path: C:\Program Files\AMD\ROCm
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-cmake-hip-${{ matrix.name }}-x64
key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
evict-old-files: 1d
- name: Install
- name: Install ROCm
if: steps.cache-rocm.outputs.cache-hit != 'true'
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
$proc.WaitForExit(600000)
$completed = $proc.WaitForExit(600000)
if (-not $completed) {
Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
$proc.Kill()
exit 1
}
if ($proc.ExitCode -ne 0) {
Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
exit 1
}
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
id: verify
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
# Find and test ROCm installation
$clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
if (-not $clangPath) {
Write-Error "ROCm installation not found"
exit 1
}
& $clangPath.FullName --version
- name: Build
id: cmake_build
@@ -585,9 +612,12 @@ jobs:
-DLLAMA_CURL=OFF
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
- name: Pack artifacts
id: pack_artifacts

View File

@@ -76,51 +76,206 @@ jobs:
run: |
pip install -r tools/server/tests/requirements.txt
# Setup nodejs (to be used for verifying bundled index.html)
- uses: actions/setup-node@v4
webui-setup:
name: WebUI Setup
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
node-version: '22.11.0'
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: WebUI - Install dependencies
id: webui_lint
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Cache node_modules
uses: actions/cache@v4
id: cache-node-modules
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install dependencies
if: steps.cache-node-modules.outputs.cache-hit != 'true'
run: npm ci
working-directory: tools/server/webui
webui-check:
needs: webui-setup
name: WebUI Check
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Run type checking
run: npm run check
working-directory: tools/server/webui
- name: Run linting
run: npm run lint
working-directory: tools/server/webui
webui-build:
needs: webui-check
name: WebUI Build
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Build application
run: npm run build
working-directory: tools/server/webui
webui-tests:
needs: webui-build
name: Run WebUI tests
permissions:
contents: read
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: "22"
- name: Restore node_modules cache
uses: actions/cache@v4
with:
path: tools/server/webui/node_modules
key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }}
restore-keys: |
${{ runner.os }}-node-modules-
- name: Install Playwright browsers
run: npx playwright install --with-deps
working-directory: tools/server/webui
- name: Build Storybook
run: npm run build-storybook
working-directory: tools/server/webui
- name: Run Client tests
run: npm run test:client
working-directory: tools/server/webui
- name: Run Server tests
run: npm run test:server
working-directory: tools/server/webui
- name: Run UI tests
run: npm run test:ui
working-directory: tools/server/webui
- name: Run E2E tests
run: npm run test:e2e
working-directory: tools/server/webui
server-build:
needs: [webui-tests]
runs-on: ubuntu-latest
strategy:
matrix:
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
build_type: [RelWithDebInfo]
include:
- build_type: Release
sanitizer: ""
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
steps:
- name: Dependencies
id: depends
run: |
cd tools/server/webui
npm ci
sudo apt-get update
sudo apt-get -y install \
build-essential \
xxd \
git \
cmake \
curl \
wget \
language-pack-en \
libcurl4-openssl-dev
- name: WebUI - Check code format
id: webui_format
- name: Clone
id: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
- name: Python setup
id: setup_python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Tests dependencies
id: test_dependencies
run: |
git config --global --add safe.directory $(realpath .)
cd tools/server/webui
git status
pip install -r tools/server/tests/requirements.txt
npm run format
git status
modified_files="$(git status -s)"
echo "Modified files: ${modified_files}"
if [ -n "${modified_files}" ]; then
echo "Files do not follow coding style. To fix: npm run format"
echo "${modified_files}"
exit 1
fi
- name: Setup Node.js for WebUI
uses: actions/setup-node@v4
with:
node-version: "22"
cache: "npm"
cache-dependency-path: "tools/server/webui/package-lock.json"
- name: Verify bundled index.html
id: verify_server_index_html
run: |
git config --global --add safe.directory $(realpath .)
cd tools/server/webui
git status
- name: Install WebUI dependencies
run: npm ci
working-directory: tools/server/webui
npm run build
git status
modified_files="$(git status -s)"
echo "Modified files: ${modified_files}"
if [ -n "${modified_files}" ]; then
echo "Repository is dirty or server/webui is not built as expected"
echo "Hint: You may need to follow Web UI build guide in server/README.md"
echo "${modified_files}"
exit 1
fi
- name: Build WebUI
run: npm run build
working-directory: tools/server/webui
- name: Build (no OpenMP)
id: cmake_build_no_openmp

.gitignore (vendored, +4 lines)
View File

@@ -148,3 +148,7 @@ poetry.toml
/run-vim.sh
/run-chat.sh
.ccache/
# Code Workspace
*.code-workspace

View File

@@ -0,0 +1,7 @@
---
trigger: manual
---
#### Tailwind & CSS
- We are using Tailwind v4, which uses oklch colors, so we now want to refer to the CSS vars directly, without wrapping them in any color function like `hsla`/`hsl`, `rgba`, etc.

View File

@@ -0,0 +1,48 @@
---
trigger: manual
---
# Coding rules
## Svelte & SvelteKit
### Services vs Stores Separation Pattern
#### `lib/services/` - Pure Business Logic
- **Purpose**: Stateless business logic and external communication
- **Contains**:
- API calls to external services (ApiService)
- Pure business logic functions (ChatService, etc.)
- **Rules**:
- NO Svelte runes ($state, $derived, $effect)
- NO reactive state management
- Pure functions and classes only
- Can import types but not stores
- Focus on "how" - implementation details
#### `lib/stores/` - Reactive State Management
- **Purpose**: Svelte-specific reactive state with runes
- **Contains**:
- Reactive state classes with $state, $derived, $effect
- Database operations (DatabaseStore)
- UI-focused state management
- Store orchestration logic
- **Rules**:
- USE Svelte runes for reactivity
- Import and use services for business logic
- NO direct database operations
- NO direct API calls (use services)
- Focus on "what" - reactive state for UI
#### Enforcement
- Services should be testable without Svelte
- Stores should leverage Svelte's reactivity system
- Clear separation: services handle data, stores handle state
- Services can be reused across multiple stores
#### Misc
- Always use `let` for $derived state variables

.windsurf/rules/tests.md (new file, 9 lines)
View File

@@ -0,0 +1,9 @@
---
trigger: manual
---
# Automated Tests
## General rules
- NEVER include any test code in the production code - we should always have it in separate dedicated files

View File

@@ -0,0 +1,7 @@
---
trigger: manual
---
## TypeScript
- Add JSDocs for functions

View File

@@ -58,6 +58,12 @@ if (MSVC)
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
endif()
if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
else()
set(LLAMA_TOOLS_INSTALL_DEFAULT ${LLAMA_STANDALONE})
endif()
#
# option list
#
@@ -82,6 +88,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
# 3rd party libs
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)

View File

@@ -1,12 +1,106 @@
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
# multiple collaborators per item can be specified
/ci/ @ggerganov
/.devops/*.Dockerfile @ngxson
/tools/server/ @ngxson
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/gguf.cpp @JohannesGaessler
/ggml/src/ggml-vulkan/ @0cc4m
/ggml/src/ggml-zdnn/ @taronaeo
/.devops/*.Dockerfile @ngxson
/.github/actions/ @slaren
/.github/workflows/ @CISC
/.github/workflows/release.yml @slaren
/.github/workflows/winget.yml @slaren
/ci/ @ggerganov
/cmake/ @ggerganov
/common/CMakeLists.txt @ggerganov
/common/arg.* @ggerganov @ericcurtin
/common/base64.hpp.* @ggerganov
/common/build-info.* @ggerganov
/common/common.* @ggerganov
/common/console.* @ggerganov
/common/llguidance.* @ggerganov
/common/log.* @ggerganov
/common/sampling.* @ggerganov
/common/speculative.* @ggerganov
/convert_*.py @CISC
/examples/batched.swift/ @ggerganov
/examples/batched/ @ggerganov
/examples/convert-llama2c-to-ggml/ @ggerganov
/examples/deprecation-warning/ @ggerganov
/examples/diffusion/ @am17an
/examples/embedding/ @ggerganov
/examples/eval-callback/ @ggerganov
/examples/export-docs/ @ggerganov
/examples/gen-docs/ @ggerganov
/examples/gguf/ @ggerganov
/examples/llama.android/ @ggerganov
/examples/llama.swiftui/ @ggerganov
/examples/llama.vim @ggerganov
/examples/lookahead/ @ggerganov
/examples/lookup/ @JohannesGaessler
/examples/model-conversion/ @danbev
/examples/parallel/ @ggerganov
/examples/passkey/ @ggerganov
/examples/retrieval/ @ggerganov
/examples/save-load-state/ @ggerganov
/examples/simple-chat/ @slaren
/examples/simple/ @slaren
/examples/speculative-simple/ @ggerganov
/examples/speculative/ @ggerganov
/ggml/cmake/ @ggerganov
/ggml/include/ @ggerganov @slaren
/ggml/src/ggml-alloc.c @slaren
/ggml/src/ggml-backend* @slaren
/ggml/src/ggml-blas/ @slaren
/ggml/src/ggml-common.h @ggerganov @slaren
/ggml/src/ggml-cpu/ @ggerganov @slaren
/ggml/src/ggml-cuda/common.cuh @slaren
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
/ggml/src/ggml-cuda/ggml-cuda.cu @slaren
/ggml/src/ggml-cuda/mmf.* @JohannesGaessler
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
/ggml/src/ggml-cuda/mmvf.* @JohannesGaessler
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
/ggml/src/ggml-impl.h @ggerganov @slaren
/ggml/src/ggml-metal/ @ggerganov
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/ggml-quants.* @ggerganov
/ggml/src/ggml-threading.* @ggerganov @slaren
/ggml/src/ggml-vulkan/ @0cc4m
/ggml/src/ggml-zdnn/ @taronaeo
/ggml/src/ggml.c @ggerganov @slaren
/ggml/src/ggml.cpp @ggerganov @slaren
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
/gguf-py/ @CISC
/media/ @ggerganov
/scripts/gen* @ggerganov
/scripts/get* @ggerganov
/scripts/sync* @ggerganov
/src/ @ggerganov
/src/llama-adapter.* @CISC
/src/llama-arch.* @CISC
/src/llama-chat.* @ngxson
/src/llama-graph.* @CISC
/src/llama-model-loader.* @slaren
/src/llama-model.* @CISC
/src/llama-vocab.* @CISC
/tests/ @ggerganov
/tests/test-backend-ops.cpp @slaren
/tests/test-thread-safety.cpp @slaren
/tools/batched-bench/ @ggerganov
/tools/llama-bench/ @slaren
/tools/main/ @ggerganov
/tools/mtmd/ @ngxson
/tools/perplexity/ @ggerganov
/tools/quantize/ @ggerganov
/tools/run/ @ericcurtin
/tools/server/* @ngxson @ggerganov @ericcurtin # no subdir
/tools/server/webui/ @allozaur
/tools/tokenize/ @ggerganov
/tools/tts/ @ggerganov
/vendor/ @ggerganov
/.clang-format @slaren
/.clang-tidy @slaren
/AUTHORS @ggerganov
/CMakeLists.txt @ggerganov
/CONTRIBUTING.md @ggerganov
/LICENSE @ggerganov
/README.md @ggerganov
/SECURITY.md @ggerganov
requirements*.txt @CISC

View File

@@ -1,4 +1,12 @@
# Pull requests (for contributors)
# Contributors
The project differentiates between 3 levels of contributors:
- Contributors: people who have contributed before (no special privileges)
- Collaborators (Triage): people with significant contributions, who may be responsible for some parts of the code, and are expected to maintain and review contributions for the code they own
- Maintainers: responsible for reviewing and merging PRs, after approval from the code owners
# Pull requests (for contributors & collaborators)
- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
- Test your changes:
@@ -9,13 +17,17 @@
- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
- Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
# Pull requests (for collaborators)
# Pull requests (for maintainers)
- Squash-merge PRs
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
- Let other maintainers merge their own PRs
- When merging a PR, make sure you have a good understanding of the changes
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
# Coding guidelines
@@ -114,6 +126,21 @@
#endif // FOO
```
# Code maintenance
- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file responsible for:
- Reviewing and merging related PRs
- Fixing related bugs
- Providing developer guidance/support
- When adding or modifying a large piece of code:
- If you are a collaborator, make sure to add yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
- If you are a contributor, find an existing collaborator who is willing to review and maintain your code long-term
- Provide the necessary CI workflow (and hardware) to test your changes (see [ci/README.md](https://github.com/ggml-org/llama.cpp/tree/master/ci))
- New code should follow the guidelines (coding, naming, etc.) outlined in this document. Exceptions are allowed in isolated, backend-specific parts of the code that do not interface directly with the `ggml` interfaces.
_(NOTE: for legacy reasons, existing code is not required to follow this guideline)_
# Documentation
- Documentation is a community effort

View File

@@ -137,6 +137,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
- [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7)
#### Multimodal
@@ -273,6 +274,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
| [Vulkan](docs/build.md#vulkan) | GPU |
| [CANN](docs/build.md#cann) | Ascend NPU |
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
| [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
@@ -519,8 +521,8 @@ To learn more about model quantization, [read this documentation](tools/quantize
## Contributing
- Contributors can open PRs
- Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Collaborators will be invited based on contributions
- Maintainers can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
- Any help with managing issues, PRs and projects is very appreciated!
- See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information

ci/README-MUSA.md (new file, 35 lines)
View File

@@ -0,0 +1,35 @@
## Running MUSA CI in a Docker Container
Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
### 1. Create a local directory to store cached models, configuration files and venv:
```bash
mkdir -p $HOME/llama.cpp/ci-cache
```
### 2. Create a local directory to store CI run results:
```bash
mkdir -p $HOME/llama.cpp/ci-results
```
### 3. Start a Docker container and run the CI:
```bash
docker run --privileged -it \
-v $HOME/llama.cpp/ci-cache:/ci-cache \
-v $HOME/llama.cpp/ci-results:/ci-results \
-v $PWD:/ws -w /ws \
mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
```
Inside the container, execute the following commands:
```bash
apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
git config --global --add safe.directory /ws
GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
```
This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.

View File

@@ -1,18 +1,10 @@
# CI
In addition to [Github Actions](https://github.com/ggml-org/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
This CI implements heavy-duty workflows that run on self-hosted runners. Typically the purpose of these workflows is to
cover hardware configurations that are not available from Github-hosted runners and/or require more computational
resources than normally available.
https://github.com/ggml-org/ci
It monitors the `master` branch for new commits and runs the
[ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
to cover various hardware architectures, including GPU and Apple Silicon instances.
Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
Only the branches of this repo are monitored for this keyword.
It is a good practice, before publishing changes to execute the full CI locally on your machine:
It is good practice, before publishing changes, to execute the full CI locally on your machine. For example:
```bash
mkdir tmp
@@ -29,40 +21,13 @@ GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with MUSA support
GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# etc.
```
## Running MUSA CI in a Docker Container
# Adding self-hosted runners
Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
### 1. Create a local directory to store cached models, configuration files and venv:
```bash
mkdir -p $HOME/llama.cpp/ci-cache
```
### 2. Create a local directory to store CI run results:
```bash
mkdir -p $HOME/llama.cpp/ci-results
```
### 3. Start a Docker container and run the CI:
```bash
docker run --privileged -it \
-v $HOME/llama.cpp/ci-cache:/ci-cache \
-v $HOME/llama.cpp/ci-results:/ci-results \
-v $PWD:/ws -w /ws \
mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
```
Inside the container, execute the following commands:
```bash
apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
git config --global --add safe.directory /ws
GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
```
This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.
- Add a self-hosted `ggml-ci` workflow to [[.github/workflows/build.yml]] with an appropriate label
- Request a runner token from `ggml-org` (for example, via a comment in the PR or email)
- Set-up a machine using the received token ([docs](https://docs.github.com/en/actions/how-tos/manage-runners/self-hosted-runners/add-runners))
- Optionally update [ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) to build and run on the target platform by gating the implementation with a `GG_BUILD_...` env

ci/run.sh (473 changed lines)
View File

@@ -45,7 +45,7 @@ SRC=`pwd`
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"
if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
fi
if [ ! -z ${GG_BUILD_CUDA} ]; then
@@ -65,6 +65,16 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
fi
fi
if [ ! -z ${GG_BUILD_ROCM} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_HIP=ON"
if [ -z ${GG_BUILD_AMDGPU_TARGETS} ]; then
echo "Missing GG_BUILD_AMDGPU_TARGETS, please set it to your GPU architecture (e.g. gfx90a, gfx1100, etc.)"
exit 1
fi
CMAKE_EXTRA="${CMAKE_EXTRA} -DAMDGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
fi
if [ ! -z ${GG_BUILD_SYCL} ]; then
if [ -z ${ONEAPI_ROOT} ]; then
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
@@ -82,6 +92,12 @@ fi
if [ ! -z ${GG_BUILD_VULKAN} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
# if on Mac, disable METAL
if [[ "$OSTYPE" == "darwin"* ]]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
fi
fi
if [ ! -z ${GG_BUILD_WEBGPU} ]; then
@@ -150,7 +166,7 @@ function gg_run_ctest_debug {
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
(time ctest --output-on-failure -L main -E "test-opt|test-backend-ops" ) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
}
@@ -200,33 +216,9 @@ function gg_sum_ctest_release {
gg_printf '```\n'
}
# test_scripts_debug
# test_scripts
function gg_run_test_scripts_debug {
cd ${SRC}
set -e
(cd ./tools/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./tools/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e
}
function gg_sum_test_scripts_debug {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test scripts in debug mode\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
gg_printf '```\n'
gg_printf '\n'
}
# test_scripts_release
function gg_run_test_scripts_release {
function gg_run_test_scripts {
cd ${SRC}
set -e
@@ -237,10 +229,10 @@ function gg_run_test_scripts_release {
set +e
}
function gg_sum_test_scripts_release {
function gg_sum_test_scripts {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Runs test scripts in release mode\n'
gg_printf 'Runs test scripts\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '```\n'
gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
@@ -249,15 +241,9 @@ function gg_sum_test_scripts_release {
}
function gg_get_model {
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
local gguf_0="$MNT/models/qwen3/0.6B/ggml-model-f16.gguf"
if [[ -s $gguf_0 ]]; then
echo -n "$gguf_0"
elif [[ -s $gguf_1 ]]; then
echo -n "$gguf_1"
elif [[ -s $gguf_2 ]]; then
echo -n "$gguf_2"
else
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
exit 1
@@ -270,7 +256,9 @@ function gg_run_ctest_with_model_debug {
local model; model=$(gg_get_model)
cd build-ci-debug
set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
cd ..
}
@@ -281,7 +269,15 @@ function gg_run_ctest_with_model_release {
local model; model=$(gg_get_model)
cd build-ci-release
set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
# test memory leaks
#if [[ ! -z ${GG_BUILD_METAL} ]]; then
# # TODO: this hangs for some reason ...
# (time leaks -quiet -atExit -- ./bin/test-thread-safety -m $model --parallel 2 -t 2 -p "hello") 2>&1 | tee -a $OUT/${ci}-leaks.log
#fi
set +e
cd ..
}
@@ -306,24 +302,22 @@ function gg_sum_ctest_with_model_release {
gg_printf '```\n'
}
# open_llama_7b_v2
# qwen3_0_6b
function gg_run_open_llama_7b_v2 {
function gg_run_qwen3_0_6b {
cd ${SRC}
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/config.json
gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/tokenizer.json
gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/tokenizer_config.json
#gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/raw/main/special_tokens_map.json
gg_wget models-mnt/qwen3/0.6B/ https://huggingface.co/Qwen/Qwen3-0.6B-Base/resolve/main/model.safetensors
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/open-llama/7B-v2"
path_models="../models-mnt/qwen3/0.6B"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
@@ -333,9 +327,11 @@ function gg_run_open_llama_7b_v2 {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf --outtype f16
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-bf16.gguf --outtype bf16
model_f16="${path_models}/ggml-model-f16.gguf"
model_bf16="${path_models}/ggml-model-bf16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
@@ -349,179 +345,51 @@ function gg_run_open_llama_7b_v2 {
wiki_test="${path_wiki}/wiki.test.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
if [ -z ${GG_BUILD_NO_BF16} ]; then
(time ./bin/llama-perplexity --model ${model_bf16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
fi
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
set +e
}
function gg_sum_open_llama_7b_v2 {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 7B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
# pythia_1.4b
function gg_run_pythia_1_4b {
cd ${SRC}
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
path_models="../models-mnt/pythia/1.4B"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
@@ -537,6 +405,9 @@ function gg_run_pythia_1_4b {
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
if [ -z ${GG_BUILD_NO_BF16} ]; then
check_ppl "bf16" "$(cat $OUT/${ci}-tg-bf16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
fi
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
@@ -553,147 +424,17 @@ function gg_run_pythia_1_4b {
set +e
}
function gg_sum_pythia_1_4b {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Pythia 1.4B:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
# pythia_2_8b
function gg_run_pythia_2_8b {
cd ${SRC}
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/pythia/2.8B"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test="${path_wiki}/wiki.test.raw"
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
set +e
}
function gg_sum_pythia_2_8b {
function gg_sum_qwen3_0_6b {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Pythia 2.8B:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- f16:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
if [ -z ${GG_BUILD_NO_BF16} ]; then
gg_printf '- bf16:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-bf16.log)"
fi
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
@@ -860,10 +601,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
fi
ret=0
if [ -z ${GG_BUILD_SYCL} ]; then
# SYCL build breaks with debug build flags
test $ret -eq 0 && gg_run ctest_debug
fi
test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -871,24 +609,13 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run rerank_tiny
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
if [ -z ${GG_BUILD_SYCL} ]; then
test $ret -eq 0 && gg_run test_scripts_debug
fi
test $ret -eq 0 && gg_run test_scripts_release
test $ret -eq 0 && gg_run test_scripts
fi
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
test $ret -eq 0 && gg_run pythia_1_4b
else
test $ret -eq 0 && gg_run pythia_2_8b
#test $ret -eq 0 && gg_run open_llama_7b_v2
fi
if [ -z ${GG_BUILD_SYCL} ]; then
test $ret -eq 0 && gg_run ctest_with_model_debug
fi
test $ret -eq 0 && gg_run ctest_with_model_release
fi
test $ret -eq 0 && gg_run qwen3_0_6b
test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release
fi
exit $ret

File diff suppressed because it is too large

@@ -163,6 +163,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
throw std::runtime_error("Invalid tool_choice: " + tool_choice);
}
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
common_chat_templates_inputs dummy_inputs;
common_chat_msg msg;
msg.role = "user";
msg.content = "test";
dummy_inputs.messages = {msg};
dummy_inputs.enable_thinking = false;
const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
dummy_inputs.enable_thinking = true;
const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
}
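The helper above probes thinking support by rendering the same dummy user message twice, once with `enable_thinking` off and once on, and reporting support only when the two rendered prompts differ. A minimal caller-side sketch (hypothetical function and variable names, not part of this change):

```cpp
// Hypothetical sketch: only honor a client's request for "thinking" output
// when the loaded chat templates actually react to the flag.
static bool resolve_enable_thinking(const common_chat_templates * tmpls, bool requested) {
    // falls back to plain prompts when the template ignores enable_thinking
    return requested && common_chat_templates_support_enable_thinking(tmpls);
}
```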
template <>
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
std::vector<common_chat_msg> msgs;
@@ -618,11 +631,13 @@ const char * common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return "DeepSeek V3.1";
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
default:
throw std::runtime_error("Unknown chat format");
}
@@ -684,11 +699,13 @@ static void parse_json_tool_calls(
size_t from = std::string::npos;
auto first = true;
while (true) {
auto start_pos = builder.pos();
auto res = function_regex_start_only && first
? builder.try_consume_regex(*function_regex_start_only)
: function_regex
? builder.try_find_regex(*function_regex, from)
: std::nullopt;
if (res) {
std::string name;
if (get_function_name) {
@@ -723,6 +740,8 @@ static void parse_json_tool_calls(
return;
}
throw common_chat_msg_partial_exception("incomplete tool call");
} else {
builder.move_to(start_pos);
}
break;
}
@@ -1184,6 +1203,67 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
});
return data;
}
static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
// Generate the prompt using the apply() function with the template
data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_NEMOTRON_V2;
// Handle thinking tags appropriately based on inputs.enable_thinking
if (string_ends_with(data.prompt, "<think>\n")) {
if (!inputs.enable_thinking) {
data.prompt += "</think>";
} else {
data.thinking_forced_open = true;
}
}
// When tools are present, build grammar for the <TOOLCALL> format, similar to CommandR, but without tool call ID
if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
data.grammar_lazy = true;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
auto schemas = json::array();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
schemas.push_back({
{ "type", "object" },
{ "properties",
{
{ "name",
{
{ "type", "string" },
{ "const", function.at("name") },
} },
{ "arguments", function.at("parameters") },
} },
{ "required", json::array({ "name", "arguments" }) },
});
});
auto schema = json{
{ "type", "array" },
{ "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
{ "minItems", 1 },
};
if (!inputs.parallel_tool_calls) {
schema["maxItems"] = 1;
}
builder.add_rule("root",
std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
"\"<TOOLCALL>\" " + builder.add_schema("tool_calls", schema) +
" \"</TOOLCALL>\"");
});
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
// If thinking_forced_open, then we capture the </think> tag in the grammar,
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
std::string(data.thinking_forced_open ?
"[\\s\\S]*?(</think>\\s*)" :
"(?:<think>[\\s\\S]*?</think>\\s*)?") +
"(<TOOLCALL>)[\\s\\S]*" });
}
return data;
}
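For orientation, the lazy grammar built above only admits a JSON array of `{name, arguments}` objects wrapped in `<TOOLCALL>` tags (with an optional leading `</think>` when thinking was forced open). An illustrative, hypothetical completion it would accept:

```cpp
// Hypothetical example of a conforming Nemotron V2 tool call
// (tool name and arguments are made up for illustration).
const char * example_output =
    "<TOOLCALL>[{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}]</TOOLCALL>";
```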
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
@@ -1313,6 +1393,71 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
}
return data;
}
static common_chat_params common_chat_params_init_deepseek_v3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
// Pass thinking context for DeepSeek V3.1 template
json additional_context = {
{"thinking", inputs.enable_thinking},
};
auto prompt = apply(tmpl, inputs,
/* messages_override= */ inputs.messages,
/* tools_override= */ std::nullopt,
additional_context);
data.prompt = prompt;
data.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
if (string_ends_with(data.prompt, "<think>")) {
if (!inputs.enable_thinking) {
data.prompt += "</think>";
} else {
data.thinking_forced_open = true;
}
}
if (inputs.tools.is_array() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> tool_rules;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
std::string name = function.at("name");
auto parameters = function.at("parameters");
builder.resolve_refs(parameters);
tool_rules.push_back(builder.add_rule(name + "-call",
"( \"<tool▁call▁begin>\" )? \"" + name + "<tool▁sep>"
"\" " + builder.add_schema(name + "-args", parameters) + " "
"\"<tool▁call▁end>\""));
});
// Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
// so we accept common variants (then it's all constrained)
builder.add_rule("root",
std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
"( \"<tool▁calls▁begin>\" | \"<tool_calls_begin>\" | \"<tool calls begin>\" | \"<tool\\\\_calls\\\\_begin>\" | \"<tool▁calls>\" ) "
"(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
"\"<tool▁calls▁end>\""
" space");
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
// If thinking_forced_open, then we capture the </think> tag in the grammar,
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
"(<tool▁calls▁begin>|<tool_calls_begin>|<tool calls begin>|<tool\\\\_calls\\\\_begin>|<tool▁calls>)[\\s\\S]*"
});
data.preserved_tokens = {
"<think>",
"</think>",
"<tool▁calls▁begin>",
"<tool▁call▁begin>",
"<tool▁sep>",
"<tool▁call▁end>",
"<tool▁calls▁end>",
};
});
}
return data;
}
static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
builder.try_parse_reasoning("<think>", "</think>");
if (!builder.syntax().parse_tool_calls) {
@@ -1334,6 +1479,66 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
tool_calls_end);
}
static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
static const common_regex function_regex("(?:<tool▁call▁begin>)?([^\\n<]+)(?:<tool▁sep>)");
static const common_regex close_regex("(?:[\\s]*)?<tool▁call▁end>");
static const common_regex tool_calls_begin("(?:<tool▁calls▁begin>|<tool_calls_begin>|<tool calls begin>|<tool\\\\_calls\\\\_begin>|<tool▁calls>)");
static const common_regex tool_calls_end("<tool▁calls▁end>");
if (!builder.syntax().parse_tool_calls) {
LOG_DBG("%s: not parse_tool_calls\n", __func__);
builder.add_content(builder.consume_rest());
return;
}
LOG_DBG("%s: parse_tool_calls\n", __func__);
parse_json_tool_calls(
builder,
/* block_open= */ tool_calls_begin,
/* function_regex_start_only= */ std::nullopt,
function_regex,
close_regex,
tool_calls_end);
}
static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
// DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
// First try to parse using the standard reasoning parsing method
LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
auto start_pos = builder.pos();
auto found_end_think = builder.try_find_literal("</think>");
builder.move_to(start_pos);
if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
common_chat_parse_deepseek_v3_1_content(builder);
} else if (builder.try_parse_reasoning("<think>", "</think>")) {
// If reasoning was parsed successfully, the remaining content is regular content
LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
// </think><tool▁calls▁begin><tool▁call▁begin>function<tool▁sep>NAME\n```json\nJSON\n```<tool▁call▁end><tool▁calls▁end>
common_chat_parse_deepseek_v3_1_content(builder);
} else {
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
common_chat_parse_deepseek_v3_1_content(builder);
return;
}
// If no reasoning tags found, check if we should treat everything as reasoning
if (builder.syntax().thinking_forced_open) {
// If thinking is forced open but no tags found, treat everything as reasoning
LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
builder.add_reasoning_content(builder.consume_rest());
} else {
LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
// <tool▁call▁begin>NAME<tool▁sep>JSON<tool▁call▁end>
common_chat_parse_deepseek_v3_1_content(builder);
}
}
}
static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
auto prompt = apply(tmpl, inputs);
@@ -1536,10 +1741,12 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
LOG_DBG("%s\n", __func__);
common_chat_params data;
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, /* tools_override= */ json(), json {
const std::optional<json> tools_override = json();
const std::optional<json> additional_context = json {
{"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
{"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
});
};
data.prompt = apply(tmpl, inputs, /* messages_override =*/ std::nullopt, tools_override, additional_context);
if (inputs.tools.is_array() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -1830,7 +2037,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
// If thinking_forced_open, then we capture the </think> tag in the grammar,
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
"(\\s*"
"\\s*("
"(?:<tool_call>"
"|<function"
"|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
@@ -2025,15 +2232,28 @@ static common_chat_params common_chat_params_init_granite(const common_chat_temp
static void common_chat_parse_granite(common_chat_msg_parser & builder) {
// Parse thinking tags
static const common_regex start_think_regex(regex_escape("<think>"));
static const common_regex end_think_regex(regex_escape("</think>"));
// Granite models output partial tokens such as "<" and "<think".
// By leveraging try_consume_regex()/try_find_regex() throwing
// common_chat_msg_partial_exception for these partial tokens,
// processing is interrupted and the tokens are not passed to add_content().
if (auto res = builder.try_consume_regex(start_think_regex)) {
// Restore position for try_parse_reasoning()
builder.move_to(res->groups[0].begin);
builder.try_find_regex(end_think_regex, std::string::npos, false);
// Restore position for try_parse_reasoning()
builder.move_to(res->groups[0].begin);
}
builder.try_parse_reasoning("<think>", "</think>");
// Parse response tags using regex
static const common_regex response_regex("<response>([\\s\\S]*?)</response>");
if (auto res = builder.try_find_regex(response_regex)) {
// Extract the content between the tags (capture group 1)
auto content = builder.str(res->groups[1]);
builder.add_content(content);
builder.move_to(res->groups[0].end);
// Parse response tags
static const common_regex start_response_regex(regex_escape("<response>"));
static const common_regex end_response_regex(regex_escape("</response>"));
// Granite models output partial tokens such as "<" and "<response".
// Same hack as reasoning parsing.
if (builder.try_consume_regex(start_response_regex)) {
builder.try_find_regex(end_response_regex);
}
if (!builder.syntax().parse_tool_calls) {
@@ -2047,19 +2267,43 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
builder.move_to(res->groups[0].end);
// Expect JSON array of tool calls
auto tool_calls_data = builder.consume_json();
if (tool_calls_data.json.is_array()) {
if (!builder.add_tool_calls(tool_calls_data.json)) {
builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
if (auto tool_call = builder.try_consume_json_with_dumped_args({{{"arguments"}}})) {
if (!builder.add_tool_calls(tool_call->value) || tool_call->is_partial) {
throw common_chat_msg_partial_exception("incomplete tool call");
}
} else {
builder.add_content("<|tool_call|>" + tool_calls_data.json.dump());
}
} else {
builder.add_content(builder.consume_rest());
}
}
static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
// Parse thinking tags
builder.try_parse_reasoning("<think>", "</think>");
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}
// Look for tool calls
static const common_regex tool_call_regex(regex_escape("<TOOLCALL>"));
if (auto res = builder.try_find_regex(tool_call_regex)) {
builder.move_to(res->groups[0].end);
// Expect JSON array of tool calls
auto tool_calls_data = builder.consume_json();
if (tool_calls_data.json.is_array()) {
if (!builder.try_consume_literal("</TOOLCALL>")) {
throw common_chat_msg_partial_exception("Incomplete tool call");
}
builder.add_tool_calls(tool_calls_data.json);
} else {
throw common_chat_msg_partial_exception("Incomplete tool call");
}
}
builder.add_content(builder.consume_rest());
}
static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
// Parse thinking tags first - this handles the main reasoning content
builder.try_parse_reasoning("<seed:think>", "</seed:think>");
@@ -2263,6 +2507,12 @@ static common_chat_params common_chat_templates_apply_jinja(
}
}
// DeepSeek V3.1: detect based on specific patterns in the template
if (src.find("message['prefix'] is defined and message['prefix'] and thinking") != std::string::npos &&
params.json_schema.is_null()) {
return common_chat_params_init_deepseek_v3_1(tmpl, params);
}
// DeepSeek R1: use handler in all cases except json schema (thinking / tools).
if (src.find("<tool▁calls▁begin>") != std::string::npos && params.json_schema.is_null()) {
return common_chat_params_init_deepseek_r1(tmpl, params);
@@ -2293,6 +2543,11 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_seed_oss(tmpl, params, inputs);
}
// Nemotron v2
if (src.find("<SPECIAL_10>") != std::string::npos) {
return common_chat_params_init_nemotron_v2(tmpl, params);
}
// Use generic handler when mixing tools + JSON schema.
// TODO: support that mix in handlers below.
if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2430,6 +2685,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
common_chat_parse_deepseek_r1(builder);
break;
case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
common_chat_parse_deepseek_v3_1(builder);
break;
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
common_chat_parse_functionary_v3_2(builder);
break;
@@ -2454,6 +2712,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
case COMMON_CHAT_FORMAT_SEED_OSS:
common_chat_parse_seed_oss(builder);
break;
case COMMON_CHAT_FORMAT_NEMOTRON_V2:
common_chat_parse_nemotron_v2(builder);
break;
default:
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
}


@@ -107,11 +107,13 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
COMMON_CHAT_FORMAT_HERMES_2_PRO,
COMMON_CHAT_FORMAT_COMMAND_R7B,
COMMON_CHAT_FORMAT_GRANITE,
COMMON_CHAT_FORMAT_GPT_OSS,
COMMON_CHAT_FORMAT_SEED_OSS,
COMMON_CHAT_FORMAT_NEMOTRON_V2,
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
@@ -198,6 +200,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_p
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
// Parses a JSON array of messages in OpenAI's chat completion API format.
// T can be std::string containing JSON or nlohmann::ordered_json
template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);


@@ -14,6 +14,7 @@
#include <climits>
#include <cmath>
#include <codecvt>
#include <chrono>
#include <cstdarg>
#include <cstring>
#include <ctime>
@@ -901,7 +902,8 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
__func__, params.model.path.c_str());
return iparams;
}
@@ -911,7 +913,8 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_context * lctx = llama_init_from_model(model, cparams);
if (lctx == NULL) {
LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
__func__, params.model.path.c_str());
llama_model_free(model);
return iparams;
}
@@ -1157,10 +1160,10 @@ struct llama_context_params common_context_params_to_llama(const common_params &
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.pooling_type = params.pooling_type;
cparams.attention_type = params.attention_type;
cparams.flash_attn_type = params.flash_attn_type;
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
cparams.offload_kqv = !params.no_kv_offload;
cparams.flash_attn = params.flash_attn;
cparams.no_perf = params.no_perf;
cparams.op_offload = !params.no_op_offload;
cparams.swa_full = params.swa_full;


@@ -193,10 +193,11 @@ struct common_params_sampling {
};
struct common_params_model {
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string docker_repo = ""; // Docker repo // NOLINT
};
struct common_params_speculative {
@@ -287,9 +288,9 @@ struct common_params {
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim
float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = -1.0f; // YaRN low correction dim
float yarn_beta_slow = -1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
// offload params
@@ -312,6 +313,7 @@ struct common_params {
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
enum llama_flash_attn_type flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // whether to use Flash Attention
struct common_params_sampling sampling;
struct common_params_speculative speculative;
@@ -375,7 +377,6 @@ struct common_params {
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool flash_attn = false; // flash attention
bool no_perf = false; // disable performance metrics
bool ctx_shift = false; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
@@ -444,7 +445,7 @@ struct common_params {
// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = false;
bool endpoint_slots = true;
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;
@@ -452,7 +453,7 @@ struct common_params {
std::string slot_save_path;
float slot_prompt_similarity = 0.5f;
float slot_prompt_similarity = 0.1f;
// batched-bench params
bool is_pp_shared = false;
@@ -733,6 +734,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
}
//
// MoE utils
//
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
static std::string llm_ffn_exps_block_regex(int idx) {
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}
static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}
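These helpers centralize the regex used to match per-layer FFN expert tensors. A hedged usage sketch follows (it assumes the usual null-pattern terminator convention for buffer-type override lists; the function name is illustrative):

```cpp
// Hypothetical sketch: build an override list that keeps MoE expert weights
// on CPU while the rest of the model follows the normal offload rules.
static std::vector<llama_model_tensor_buft_override> make_cpu_moe_overrides() {
    std::vector<llama_model_tensor_buft_override> overrides;
    overrides.push_back(llm_ffn_exps_cpu_override()); // matches every block's experts
    overrides.push_back({ nullptr, nullptr });        // assumed null-pattern terminator
    return overrides;
}
```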
//
// training utils
//


@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
};
static bool is_reserved_name(const std::string & name) {
static std::unordered_set<std::string> RESERVED_NAMES;
if (RESERVED_NAMES.empty()) {
RESERVED_NAMES.insert("root");
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
}
static const std::unordered_set<std::string> RESERVED_NAMES = [] {
std::unordered_set<std::string> s;
s.insert("root");
for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
return s;
}();
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
}
@@ -843,9 +844,10 @@ public:
_build_object_rule(
properties, required, name,
schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
} else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
} else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
std::unordered_set<std::string> required;
std::vector<std::pair<std::string, json>> properties;
std::map<std::string, size_t> enum_values;
std::string hybrid_name = name;
std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
if (comp_schema.contains("$ref")) {
@@ -857,6 +859,14 @@ public:
required.insert(prop.key());
}
}
} else if (comp_schema.contains("enum")) {
for (const auto & v : comp_schema["enum"]) {
const auto rule = _generate_constant_rule(v);
if (enum_values.find(rule) == enum_values.end()) {
enum_values[rule] = 0;
}
enum_values[rule] += 1;
}
} else {
// todo warning
}
@@ -870,6 +880,17 @@ public:
add_component(t, true);
}
}
if (!enum_values.empty()) {
std::vector<std::string> enum_intersection;
for (const auto & p : enum_values) {
if (p.second == schema["allOf"].size()) {
enum_intersection.push_back(p.first);
}
}
if (!enum_intersection.empty()) {
return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
}
}
return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
} else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];


@@ -4,17 +4,52 @@
#include <condition_variable>
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <mutex>
#include <sstream>
#include <thread>
#include <vector>
#if defined(_WIN32)
# include <io.h>
# include <windows.h>
# define isatty _isatty
# define fileno _fileno
#else
# include <unistd.h>
#endif // defined(_WIN32)
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
void common_log_set_verbosity_thold(int verbosity) {
common_log_verbosity_thold = verbosity;
}
// Auto-detect if colors should be enabled based on terminal and environment
static bool common_log_should_use_colors_auto() {
// Check NO_COLOR environment variable (https://no-color.org/)
if (const char * no_color = std::getenv("NO_COLOR")) {
if (no_color[0] != '\0') {
return false;
}
}
// Check TERM environment variable
if (const char * term = std::getenv("TERM")) {
if (std::strcmp(term, "dumb") == 0) {
return false;
}
}
// Check if stdout and stderr are connected to a terminal
// We check both because log messages can go to either
bool stdout_is_tty = isatty(fileno(stdout));
bool stderr_is_tty = isatty(fileno(stderr));
return stdout_is_tty || stderr_is_tty;
}
static int64_t t_us() {
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
}
@@ -353,6 +388,11 @@ struct common_log * common_log_init() {
struct common_log * common_log_main() {
static struct common_log log;
static std::once_flag init_flag;
std::call_once(init_flag, [&]() {
// Set default to auto-detect colors
log.set_colors(common_log_should_use_colors_auto());
});
return &log;
}
@@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file) {
log->set_file(file);
}
void common_log_set_colors(struct common_log * log, bool colors) {
log->set_colors(colors);
void common_log_set_colors(struct common_log * log, log_colors colors) {
if (colors == LOG_COLORS_AUTO) {
log->set_colors(common_log_should_use_colors_auto());
return;
}
if (colors == LOG_COLORS_DISABLED) {
log->set_colors(false);
return;
}
GGML_ASSERT(colors == LOG_COLORS_ENABLED);
log->set_colors(true);
}
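With the tri-state `log_colors` enum, callers request auto-detection explicitly instead of passing a bool. A brief usage sketch against the new signature above:

```cpp
// Hypothetical sketch: pick color behavior at startup.
common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);        // same as the new default
// common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED); // force colors off
// common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);  // force colors on
```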
void common_log_set_prefix(struct common_log * log, bool prefix) {


@@ -24,6 +24,12 @@
#define LOG_DEFAULT_DEBUG 1
#define LOG_DEFAULT_LLAMA 0
enum log_colors {
LOG_COLORS_AUTO = -1,
LOG_COLORS_DISABLED = 0,
LOG_COLORS_ENABLED = 1,
};
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity is lower
// than the threshold set via common_log_set_verbosity()
extern int common_log_verbosity_thold;
@@ -65,10 +71,10 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
//
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold


@@ -426,8 +426,29 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
// helpers
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
return &gsmpl->cur_p;
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
auto * res = &gsmpl->cur_p;
if (do_sort && !res->sorted) {
// remember the selected token before sorting
const llama_token id = res->data[res->selected].id;
std::sort(res->data, res->data + res->size, [](const llama_token_data & a, const llama_token_data & b) {
return a.p > b.p;
});
// restore the selected token after sorting
for (size_t i = 0; i < res->size; ++i) {
if (res->data[i].id == id) {
res->selected = i;
break;
}
}
res->sorted = true;
}
return res;
}
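Since callers can now request a sorted view, the typical consumer loop matches the draft-candidate logging in speculative.cpp shown further down; a minimal hedged sketch:

```cpp
// Hypothetical sketch: print the top candidates in descending probability.
llama_token_data_array * cur_p = common_sampler_get_candidates(gsmpl, /*do_sort=*/true);
for (size_t i = 0; i < cur_p->size && i < 3; ++i) {
    printf("candidate %zu: token %d, p = %.3f\n", i, cur_p->data[i].id, cur_p->data[i].p);
}
```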
llama_token common_sampler_last(const struct common_sampler * gsmpl) {


@@ -86,7 +86,9 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
// helpers
// access the internal list of current candidate tokens
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
// if do_sort == true, the candidates are guaranteed to be sorted afterwards (in descending order of probability)
// the .sorted flag of the result indicates whether the returned candidates are sorted
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort);
// get the last accepted token
llama_token common_sampler_last(const struct common_sampler * gsmpl);


@@ -317,7 +317,7 @@ llama_tokens common_speculative_gen_draft(
common_sampler_sample(smpl, ctx_dft, 0, true);
const auto * cur_p = common_sampler_get_candidates(smpl);
const auto * cur_p = common_sampler_get_candidates(smpl, true);
for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",


@@ -302,10 +302,6 @@ class ModelBase:
# data = data_torch.squeeze().numpy()
data = data_torch.numpy()
# if data ends up empty, it means data_torch was a scalar tensor -> restore
if len(data.shape) == 0:
data = data_torch.numpy()
n_dims = len(data.shape)
data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)
@@ -739,6 +735,9 @@ class TextModel(ModelBase):
if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
# ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
res = "qwen2"
if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
# ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
res = "grok-2"
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
res = "llama-bpe"
@@ -889,6 +888,9 @@ class TextModel(ModelBase):
if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
# ref: https://huggingface.co/JetBrains/Mellum-4b-base
res = "mellum"
if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
# ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
res = "llada-moe"
if res is None:
logger.warning("\n")
@@ -2391,7 +2393,10 @@ class SmolVLMModel(MmprojModel):
return [] # skip other tensors
@ModelBase.register("Llama4ForConditionalGeneration")
@ModelBase.register(
"Llama4ForConditionalGeneration",
"Llama4ForCausalLM",
)
class Llama4Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA4
undo_permute = False
@@ -2409,6 +2414,10 @@ class Llama4Model(LlamaModel):
super().set_gguf_parameters()
self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"])
self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])
if "layer_types" in self.hparams:
if all(lt == "full_attention" for lt in self.hparams["layer_types"]):
# all layers are full attention (for MobileLLM), disable swa
self.gguf_writer.add_sliding_window(0)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
if name.startswith("language_model."):
@@ -2686,12 +2695,20 @@ class BitnetModel(TextModel):
yield (new_name, data_torch)
@ModelBase.register("GrokForCausalLM")
@ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM")
class GrokModel(TextModel):
model_arch = gguf.MODEL_ARCH.GROK
def set_vocab(self):
self._set_vocab_sentencepiece()
if (self.dir_model / 'tokenizer.model').is_file():
self._set_vocab_sentencepiece()
return
if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file():
logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer')
sys.exit(1)
self._set_vocab_gpt2()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -2699,11 +2716,46 @@ class GrokModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
_experts: list[dict[str, Tensor]] | None = None
self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0))
self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0))
if (final_logit_softcap := self.hparams.get("final_logit_softcapping")):
self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
if (rope_dim := self.hparams.get("head_dim")) is None:
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
# Treat "original" as "yarn", seems to have been a mistake
if self.hparams.get("rope_type") in ("yarn", "original"):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"])
self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"])
self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"])
self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"])
self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"])
if temp_len := self.hparams.get("attn_temperature_len"):
self.gguf_writer.add_attn_temperature_length(temp_len)
self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5))
self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"])
self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"])
_experts: list[dict[str, list[Tensor]]] | None = None
_cur_expert = ""
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
tensors: list[tuple[str, Tensor]] = []
is_expert = ".moe." in name or ".block_sparse_moe.experts." in name
if not is_expert:
tensors.append((self.map_tensor_name(name), data_torch))
# process the experts separately
if name.find(".moe.") != -1:
if is_expert or self._cur_expert:
n_experts = self.hparams["num_local_experts"]
assert bid is not None
@@ -2711,32 +2763,41 @@ class GrokModel(TextModel):
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for wid in ["linear", "linear_1", "linear_v"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
# concatenate split tensors
if name in self._experts[bid]:
self._cur_expert = name
self._experts[bid][name].append(data_torch)
return []
elif is_expert:
self._cur_expert = name
self._experts[bid][name] = [data_torch]
return []
else:
self._cur_expert = ""
return [(self.map_tensor_name(name), data_torch)]
for bid in range(self.block_count):
if len(self._experts[bid]) >= n_experts * 3:
# merge the experts into a single 3d tensor
for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight"
if ename not in self._experts[bid]:
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight"
tensor_list = self._experts[bid][ename]
datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight"
new_name = self.map_tensor_name(merged_name)
yield (new_name, data_torch)
yield from tensors
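For readers skimming the expert-merge logic above, here is a minimal standalone sketch (hypothetical shapes: two experts, each weight arriving as two chunks) of what the `torch.cat` / `torch.stack` calls produce:

```python
# Illustrative only: split per-expert chunks are first concatenated back into
# one weight per expert, then all experts are stacked into a single 3D tensor.
import torch

n_expert, n_ff, n_embd = 2, 8, 4  # hypothetical sizes

# pretend each expert's weight arrived as two row-chunks (split along dim 0)
chunks = [[torch.randn(n_ff // 2, n_embd) for _ in range(2)] for _ in range(n_expert)]

per_expert = [torch.cat(c, dim=0) for c in chunks]   # each -> [n_ff, n_embd]
merged = torch.stack(per_expert, dim=0)               # -> [n_expert, n_ff, n_embd]
print(merged.shape)  # torch.Size([2, 8, 4])
```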
@ModelBase.register("DbrxForCausalLM")
@@ -5126,6 +5187,29 @@ class Gemma3Model(TextModel):
return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("Gemma3TextModel")
class EmbeddingGemma(Gemma3Model):
model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING
def set_gguf_parameters(self):
super().set_gguf_parameters()
# Override the sliding window size as it gets adjusted by the Gemma3TextConfig
# constructor. We want to use the value from the original model's config.json.
# ref: https://github.com/huggingface/transformers/pull/40700
with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
config = json.load(f)
orig_sliding_window = config.get("sliding_window")
if orig_sliding_window is None:
raise ValueError("sliding_window not found in model config - this is required for the model")
logger.info(f"Using original sliding_window from config: {orig_sliding_window} "
f"instead of {self.hparams['sliding_window']}")
self.gguf_writer.add_sliding_window(orig_sliding_window)
self._try_set_pooling_type()
@ModelBase.register("Gemma3ForConditionalGeneration")
class Gemma3VisionModel(MmprojModel):
def set_gguf_parameters(self):
@@ -5932,9 +6016,34 @@ class SeedOssModel(TextModel):
@ModelBase.register("Olmo2ForCausalLM")
@ModelBase.register("Olmo3ForCausalLM")
class Olmo2Model(TextModel):
model_arch = gguf.MODEL_ARCH.OLMO2
def set_gguf_parameters(self):
super().set_gguf_parameters()
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
if "sliding_window" in self.hparams:
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
sliding_window_pattern = []
if "layer_types" in self.hparams:
sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]]
else:
# Olmo2 does not use sliding window attention.
# Olmo3 defaults to using sliding window for all layers except every 4th.
for i in range(self.hparams["num_hidden_layers"]):
sliding_window_pattern.append((i + 1) % 4 != 0)
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
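A quick illustration of the default Olmo3 pattern built above, using a hypothetical layer count; `True` marks a sliding-window layer and every 4th layer stays full attention:

```python
# Illustrative only: reproduces the fallback pattern when "layer_types" is absent.
num_hidden_layers = 8  # hypothetical value
pattern = [(i + 1) % 4 != 0 for i in range(num_hidden_layers)]
print(pattern)  # [True, True, True, False, True, True, True, False]
```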
@ModelBase.register("OlmoeForCausalLM")
class OlmoeModel(TextModel):
@@ -6682,6 +6791,8 @@ class T5Model(TextModel):
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
self.gguf_writer.add_block_count(self.hparams["num_layers"])
if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None:
self.gguf_writer.add_decoder_block_count(dec_n_layer)
self.gguf_writer.add_head_count(self.hparams["num_heads"])
self.gguf_writer.add_key_length(self.hparams["d_kv"])
self.gguf_writer.add_value_length(self.hparams["d_kv"])
@@ -7545,6 +7656,21 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
if i not in self._attn_layers
]
# There are some models in this family that are non-hybrid, but keep the
# same parent class by setting all layers to "attention." If this is the
# case, the model architecture needs to be updated to a standard
# "granite" or "granitemoe" model
if not self._ssm_layers:
has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
new_arch = (
gguf.MODEL_ARCH.GRANITE_MOE
if has_experts else
gguf.MODEL_ARCH.GRANITE
)
self.model_arch = new_arch
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch]
self.gguf_writer.add_architecture()
# n_group and d_inner are used during reshape_tensors for mamba2
# NOTE: Explicitly include hparam prefix prefix for d_model to
# disambiguate with top-level head_dim
@@ -7629,8 +7755,11 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
self.gguf_writer.add_rope_dimension_count(rope_dim)
self.gguf_writer.add_head_count_kv(head_count_kv_vec)
## If Bamba, use rope, otherwise don't
use_rope = "BambaForCausalLM" in self.hparams["architectures"]
## If Bamba or non-hybrid, use rope, otherwise don't
use_rope = (
"BambaForCausalLM" in self.hparams["architectures"]
or not self._ssm_layers
)
self.gguf_writer.add_rope_scaling_finetuned(use_rope)
if not use_rope:
self.gguf_writer.add_context_length(2**20)
@@ -8163,6 +8292,76 @@ class HunYuanMoEModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM")
class LLaDAMoEModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLADA_MOE
def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
# number of experts used per token (top-k)
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
self.gguf_writer.add_mask_token_id(156895)
self.gguf_writer.add_causal_attention(False)
self.gguf_writer.add_diffusion_shift_logits(False)
_experts: list[dict[str, Tensor]] | None = None
# Copied from: Qwen2MoeModel
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
# Copied from: Qwen2MoeModel
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("HunYuanDenseV1ForCausalLM")
class HunYuanModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE

View File

@@ -139,6 +139,7 @@ models = [
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
{"name": "llada-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
]
# some models are known to be broken upstream, so we will skip them as exceptions
@@ -158,6 +159,7 @@ pre_computed_hashes = [
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
]

View File

@@ -12,7 +12,7 @@ import json
from math import prod
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
from transformers import AutoConfig
from transformers import AutoConfig, AutoTokenizer
import torch
@@ -26,6 +26,8 @@ import gguf
# reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import LazyTorchTensor, ModelBase
from gguf.constants import GGUFValueType
logger = logging.getLogger("lora-to-gguf")
@@ -369,7 +371,31 @@ if __name__ == '__main__':
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
def set_gguf_parameters(self):
logger.debug("GGUF KV: %s = %d", gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
alora_invocation_tokens = lparams.get("alora_invocation_tokens")
invocation_string = lparams.get("invocation_string")
if invocation_string and not alora_invocation_tokens:
logger.debug("Tokenizing invocation_string -> alora_invocation_tokens")
base_model_path_or_id = hparams.get("_name_or_path")
try:
tokenizer = AutoTokenizer.from_pretrained(base_model_path_or_id)
except ValueError:
logger.error("Unable to load tokenizer from %s", base_model_path_or_id)
raise
# NOTE: There's an off-by-one with the older aLoRAs where
# the invocation string includes the "<|start_of_turn|>"
# token, but the adapters themselves were trained to
# activate _after_ that first token, so we drop it here.
alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:]
if alora_invocation_tokens:
logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, alora_invocation_tokens)
self.gguf_writer.add_key_value(
gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS,
alora_invocation_tokens,
GGUFValueType.ARRAY,
GGUFValueType.UINT32,
)
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters
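A rough sketch of the tokenization fallback above; the model id and invocation string below are placeholders, not values taken from any real adapter config:

```python
# Sketch under assumed inputs: tokenize the invocation string with the base
# model's tokenizer and drop the first token, per the off-by-one NOTE above.
from transformers import AutoTokenizer

base_model_path_or_id = "org/base-model"          # hypothetical base model id
invocation_string = "<|start_of_turn|>assistant"  # hypothetical invocation string

tokenizer = AutoTokenizer.from_pretrained(base_model_path_or_id)
alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:]
print(alora_invocation_tokens)
```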

View File

@@ -293,17 +293,14 @@ We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers fr
## Environment variable setup
### GGML_CANN_ASYNC_MODE
Enables asynchronous operator submission. Disabled by default.
### GGML_CANN_MEM_POOL
Specifies the memory pool management strategy:
Specifies the memory pool management strategy. Default is vmm.
- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.
- prio: Employs a priority queue-based memory pool management.
- leg: Uses a fixed-size buffer pool.
### GGML_CANN_DISABLE_BUF_POOL_CLEAN
@@ -312,5 +309,16 @@ Controls automatic cleanup of the memory pool. This option is only effective whe
### GGML_CANN_WEIGHT_NZ
Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
Converts the matmul weight format from ND to NZ to improve performance. Enabled by default.
### GGML_CANN_ACL_GRAPH
Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default.
### GGML_CANN_GRAPH_CACHE_CAPACITY
Maximum number of compiled CANN graphs kept in the LRU cache, default is 12. When the number of cached graphs exceeds this capacity, the least recently used graph will be evicted.
### GGML_CANN_PREFILL_USE_GRAPH
Enable ACL graph execution during the prefill stage, default is false. This option is only effective when FA is enabled.
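As a rough sketch of how these variables might be applied when launching a CANN build (the binary path, model path, and chosen values are placeholders, not recommendations):

```python
# Illustrative only: set the CANN environment variables described above and
# launch llama-cli with them; paths and values are assumptions for the example.
import os
import subprocess

env = dict(os.environ,
           GGML_CANN_MEM_POOL="prio",            # priority-queue pool instead of the default vmm
           GGML_CANN_GRAPH_CACHE_CAPACITY="8",   # keep at most 8 compiled graphs in the LRU cache
           GGML_CANN_PREFILL_USE_GRAPH="1")      # enable ACL graph execution during prefill

subprocess.run(["./build/bin/llama-cli", "-m", "model.gguf", "-p", "Hello"],
               env=env, check=True)
```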

docs/backend/zDNN.md (new file, 61 lines)
View File

@@ -0,0 +1,61 @@
# llama.cpp for IBM zDNN Accelerator
## Background
IBM zDNN (Z Deep Neural Network) is a hardware acceleration library designed specifically to leverage the IBM NNPA (Neural Network Processor Assist) accelerator located within IBM Telum I and II processors. It provides significant performance improvements for neural network inference operations.
### Llama.cpp + IBM zDNN
The llama.cpp zDNN backend is designed to enable llama.cpp on IBM z17 and later systems via the IBM zDNN hardware acceleration library.
## Software & Hardware Support
| Hardware Level | Status | Verified |
| -------------------- | ------------- | -------------------------- |
| IBM z17 / LinuxONE 5 | Supported | RHEL 9.6, IBM z17, 40 IFLs |
| IBM z16 / LinuxONE 4 | Not Supported | |
## Data Types Supported
| Data Type | Status |
| --------- | --------- |
| F32 | Supported |
| F16 | Supported |
| BF16 | Supported |
## CMake Options
The IBM zDNN backend has the following CMake options that control the behaviour of the backend.
| CMake Option | Default Value | Description |
| ------------ | ------------- | ----------------------------------- |
| `GGML_ZDNN` | `OFF` | Compile llama.cpp with zDNN support |
| `ZDNN_ROOT` | `""` | Override zDNN library lookup |
## 1. Install zDNN Library
Note: Using the zDNN library provided via `apt` or `yum` may not work correctly as reported in [#15772](https://github.com/ggml-org/llama.cpp/issues/15772). It is preferred that you compile from source.
```sh
git clone --recurse-submodules https://github.com/IBM/zDNN
cd zDNN
autoreconf .
./configure --prefix=/opt/zdnn-libs
make build
sudo make install
```
## 2. Build llama.cpp
```sh
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
cmake -S . -G Ninja -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_ZDNN=ON \
-DZDNN_ROOT=/opt/zdnn-libs
cmake --build build --config Release -j$(nproc)
```

View File

@@ -42,18 +42,6 @@ cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
```
- NNPA is disabled by default. To enable it:
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DGGML_NNPA=ON
cmake --build build --config Release -j $(nproc)
```
- For debug builds:
```bash
@@ -164,15 +152,11 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
Only available in IBM z15/LinuxONE 3 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation.
### 2. NNPA Vector Intrinsics Acceleration
Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
### 3. zDNN Accelerator (WIP)
### 2. zDNN Accelerator (WIP)
Only available in IBM z17/LinuxONE 5 or later system with the `-DGGML_ZDNN=ON` compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs will default back to CPU routines.
### 4. Spyre Accelerator
### 3. Spyre Accelerator
_Only available with IBM z17 / LinuxONE 5 or later system. No support currently available._
@@ -230,10 +214,6 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
```
5. `-DGGML_NNPA=ON` generates gibberish output
Answer: We are aware of this as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF`.
## Getting Help on IBM Z & LinuxONE
1. **Bugs, Feature Requests**
@@ -258,38 +238,38 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
## Appendix B: SIMD Support Matrix
| | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
| ---------- | ----------- | ---- | ---- | ----- |
| FP32 | ✅ | ✅ | ✅ | ❓ |
| FP16 | ✅ | ✅ | ❓ | ❓ |
| BF16 | 🚫 | 🚫 | ❓ | ❓ |
| Q4_0 | ✅ | ✅ | ❓ | ❓ |
| Q4_1 | ✅ | ✅ | ❓ | ❓ |
| MXFP4 | 🚫 | 🚫 | ❓ | ❓ |
| Q5_0 | ✅ | ✅ | ❓ | ❓ |
| Q5_1 | ✅ | ✅ | ❓ | ❓ |
| Q8_0 | ✅ | ✅ | ❓ | ❓ |
| Q2_K | 🚫 | 🚫 | ❓ | ❓ |
| Q3_K | ✅ | ✅ | ❓ | ❓ |
| Q4_K | ✅ | ✅ | ❓ | ❓ |
| Q5_K | ✅ | ✅ | ❓ | ❓ |
| Q6_K | ✅ | ✅ | ❓ | ❓ |
| TQ1_0 | 🚫 | 🚫 | ❓ | ❓ |
| TQ2_0 | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_XXS | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_XS | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ3_XXS | 🚫 | 🚫 | ❓ | ❓ |
| IQ3_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ1_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ1_M | 🚫 | 🚫 | ❓ | ❓ |
| IQ4_NL | ✅ | ✅ | ❓ | ❓ |
| IQ4_XS | ✅ | ✅ | ❓ | ❓ |
| FP32->FP16 | 🚫 | ✅ | ❓ | ❓ |
| FP16->FP32 | 🚫 | ✅ | ❓ | ❓ |
| | VX/VXE/VXE2 | zDNN | Spyre |
|------------|-------------|------|-------|
| FP32 | ✅ | ✅ | ❓ |
| FP16 | ✅ | ✅ | ❓ |
| BF16       | 🚫          | ✅   | ❓     |
| Q4_0 | ✅ | ❓ | ❓ |
| Q4_1 | ✅ | ❓ | ❓ |
| MXFP4 | 🚫 | ❓ | ❓ |
| Q5_0 | ✅ | ❓ | ❓ |
| Q5_1 | ✅ | ❓ | ❓ |
| Q8_0 | ✅ | ❓ | ❓ |
| Q2_K | 🚫 | ❓ | ❓ |
| Q3_K | ✅ | ❓ | ❓ |
| Q4_K | ✅ | ❓ | ❓ |
| Q5_K | ✅ | ❓ | ❓ |
| Q6_K | ✅ | ❓ | ❓ |
| TQ1_0 | 🚫 | ❓ | ❓ |
| TQ2_0 | 🚫 | ❓ | ❓ |
| IQ2_XXS | 🚫 | ❓ | ❓ |
| IQ2_XS | 🚫 | ❓ | ❓ |
| IQ2_S | 🚫 | ❓ | ❓ |
| IQ3_XXS | 🚫 | ❓ | ❓ |
| IQ3_S | 🚫 | ❓ | ❓ |
| IQ1_S | 🚫 | ❓ | ❓ |
| IQ1_M | 🚫 | ❓ | ❓ |
| IQ4_NL | ✅ | ❓ | ❓ |
| IQ4_XS | ✅ | ❓ | ❓ |
| FP32->FP16 | 🚫 | ❓ | ❓ |
| FP16->FP32 | 🚫 | ❓ | ❓ |
- ✅ - acceleration available
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Aug 22, 2025.
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 7, 2025.

View File

@@ -59,8 +59,6 @@ cmake --build build --config Release
cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
cmake --build build-arm64-windows-llvm-release
```
Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.
For building with ninja generator and clang compiler as default:
-set path:set LIB=C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\um\x64;C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.41.34120\lib\x64\uwp;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22621.0\ucrt\x64
```bash

View File

@@ -18,6 +18,7 @@ Legend:
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| ADD_ID | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
@@ -26,6 +27,7 @@ Legend:
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| CONV_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
@@ -49,9 +51,11 @@ Legend:
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
| IM2COL_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
@@ -61,7 +65,9 @@ Legend:
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| OPT_STEP_SGD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
@@ -98,6 +104,7 @@ Legend:
| SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| SWIGLU_OAI | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |

File diff suppressed because it is too large.

View File

@@ -20,7 +20,6 @@ else()
add_subdirectory(gguf-hash)
add_subdirectory(gguf)
add_subdirectory(gritlm)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(parallel)

View File

@@ -1,50 +0,0 @@
#!/usr/bin/env bash
set -e
AI_NAME="${AI_NAME:-Miku}"
MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
USER_NAME="${USER_NAME:-Anon}"
# Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}"
CTX_SIZE="${CTX_SIZE:-4096}"
N_PREDICTS="${N_PREDICTS:-4096}"
GEN_OPTIONS=(--batch_size 1024
--ctx_size "$CTX_SIZE"
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647
--temp 0.6
--mirostat 2)
if [ -n "$N_THREAD" ]; then
GEN_OPTIONS+=(--threads "$N_THREAD")
fi
./llama-cli "${GEN_OPTIONS[@]}" \
--model "$MODEL" \
--in-prefix " " \
--in-suffix "${AI_NAME}:" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--reverse-prompt "${USER_NAME}:" \
--prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
The conversation is only between ${USER_NAME} and ${AI_NAME}.
The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
${AI_NAME} can only communicate through text, so she can't send images or videos.
${USER_NAME}: Hello!
${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk, so it's important that I make a good first impression!
${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant (or whatever you like!), it's so nice to meet you! ^_^
${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
${AI_NAME}: What do you like to do in your free time? ^_^
${USER_NAME}:" "$@"

View File

@@ -1,57 +0,0 @@
@setlocal disabledelayedexpansion enableextensions
@echo off
cd /d "%~dp0.."
if not "%errorlevel%"=="0" (
echo Unable to change directory.
pause
exit /b 1
)
if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin"
if not defined USER_NAME set "USER_NAME=User"
if not defined AI_NAME set "AI_NAME=ChatLLaMa"
rem Adjust to the number of CPU cores you want to use.
rem if not defined N_THREAD set "N_THREAD=8"
rem Number of tokens to predict (made it larger than default because we want a long interaction)
if not defined N_PREDICTS set "N_PREDICTS=2048"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
rem Default main script paths
set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
rem Get main script path from command line arguments
set "MAIN_SCRIPT_PATH=%~1"
rem If the main script path was not specified, try the default paths
if not defined MAIN_SCRIPT_PATH (
for %%i in (%DEFAULT_MAIN_SCRIPT_PATHS%) do (
if exist "%%i" set "MAIN_SCRIPT_PATH=%%i"
)
)
rem If the main script path was not found, tell the user how to specify it
if not defined MAIN_SCRIPT_PATH (
echo The main script could not be found. Please provide the path to the main script as 1st argument to this script, or place the main script in one of the default locations:
echo %DEFAULT_MAIN_SCRIPT_PATHS%
pause
exit /b 1
)
rem Default context, feel free to edit it
set "PROMPT_TEXT=Text transcript of a never ending dialog, where %USER_NAME% interacts with an AI assistant named %AI_NAME%. %AI_NAME% is helpful, kind, honest, friendly, good at writing and never fails to answer %USER_NAME%'s requests immediately and with details and precision. There are no annotations like (30 seconds passed...) or (to himself), just what %USER_NAME% and %AI_NAME% say aloud to each other. The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. The transcript only includes text, it does not include markup like HTML and Markdown."
rem Set a temporary variable if N_THREAD is set
if defined N_THREAD (
set "_N_THREAD=--threads %N_THREAD%"
) else (
set "_N_THREAD="
)
rem Run the script
echo "%MAIN_SCRIPT_PATH%" %GEN_OPTIONS% %_N_THREAD% ^
--model "%MODEL%" ^
--n_predict %N_PREDICTS% ^
--color --interactive ^
--reverse-prompt "%USER_NAME%:" ^
--prompt "%PROMPT_TEXT%"

View File

@@ -1,41 +0,0 @@
#!/usr/bin/env bash
set -e
cd "$(dirname "$0")/.." || exit
MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
USER_NAME="${USER_NAME:-USER}"
AI_NAME="${AI_NAME:-ChatLLaMa}"
# Adjust to the number of CPU cores you want to use.
N_THREAD="${N_THREAD:-8}"
# Number of tokens to predict (made it larger than default because we want a long interaction)
N_PREDICTS="${N_PREDICTS:-2048}"
# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)
PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--file ${PROMPT_FILE} \
--reverse-prompt "${USER_NAME}:" \
--in-prefix ' ' \
"$@"

View File

@@ -1,149 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
cd "$(dirname "$0")/.." || exit
if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
echo >&2 "error: PROMPT_CACHE_FILE and CHAT_SAVE_DIR must be provided"
exit 1
fi
MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}"
PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
USER_NAME="${USER_NAME:-User}"
AI_NAME="${AI_NAME:-ChatLLaMa}"
DATE_TIME="$(date +%H:%M)"
DATE_YEAR="$(date +%Y)"
LOG="${CHAT_SAVE_DIR}/main.log"
LOG_BG="${CHAT_SAVE_DIR}/main-bg.log"
CUR_PROMPT_FILE="${CHAT_SAVE_DIR}/current-prompt.txt"
CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
'|'\
'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
CTX_SIZE=2048
CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
# An unbuffered `tail -c+N`
skip_bytes() {
LANG=C IFS= read -r -n "$1" -d '' c
while LANG=C IFS= read -r -n 1 -d '' c; do
printf '%s' "$c"
done
}
mkdir -p "$CHAT_SAVE_DIR"
echo >"$LOG"
trap "tail -n100 ${LOG}" EXIT
if [[ ! -e "$CUR_PROMPT_FILE" ]]; then
sed -e "s/\[\[USER_NAME\]\]/${USER_NAME}/g" \
-e "s/\[\[AI_NAME\]\]/${AI_NAME}/g" \
-e "s/\[\[DATE_TIME\]\]/${DATE_TIME}/g" \
-e "s/\[\[DATE_YEAR\]\]/${DATE_YEAR}/g" \
"$PROMPT_TEMPLATE" >"$CUR_PROMPT_FILE"
fi
if [[ ! -e "$NEXT_PROMPT_FILE" ]]; then
sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
fi
if [[ "$(tail -c4 "$NEXT_PROMPT_FILE")" != "..." ]]; then
echo '...' >>"$NEXT_PROMPT_FILE"
fi
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
echo 'Prompt cache does not exist, building...'
# Default batch_size to 64 here for better user feedback during initial prompt processing
./llama-cli 2>>"$LOG" \
--batch_size 64 \
"${OPTS[@]}" \
--prompt-cache "$PROMPT_CACHE_FILE" \
--file "$CUR_PROMPT_FILE" \
--n_predict 1
echo
echo 'Done!'
fi
if [[ ! -e "$CUR_PROMPT_CACHE" ]]; then
cp "$PROMPT_CACHE_FILE" "$CUR_PROMPT_CACHE"
fi
if [[ ! -e "$NEXT_PROMPT_CACHE" ]]; then
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
fi
printf '%s ' "$(< "$CUR_PROMPT_FILE")"
n_tokens=0
while read -e line; do
# Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
# Swap prompts when we're about to run out of context
if ((n_predict <= 0)); then
wait # for background main (below) to finish with next prompt
mv "$NEXT_PROMPT_FILE" "$CUR_PROMPT_FILE"
mv "$NEXT_PROMPT_CACHE" "$CUR_PROMPT_CACHE"
sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
echo '...' >>"$NEXT_PROMPT_FILE"
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
n_tokens=0
n_predict=$((CTX_SIZE / 2))
fi
echo " ${line}" >>"$CUR_PROMPT_FILE"
if ((n_tokens > CTX_ROTATE_POINT)); then
echo " ${line}" >>"$NEXT_PROMPT_FILE"
fi
n_prompt_len_pre=$(($(wc -c <"$CUR_PROMPT_FILE")))
printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
./llama-cli 2>>"$LOG" "${OPTS[@]}" \
--prompt-cache "$CUR_PROMPT_CACHE" \
--prompt-cache-all \
--file "$CUR_PROMPT_FILE" \
--reverse-prompt "${USER_NAME}:" \
--n_predict "$n_predict" |
skip_bytes 1 | # skip BOS token added by ./llama-cli
tee "$CUR_PROMPT_FILE.tmp" | # save prompt + generation to tmp file
skip_bytes "$n_prompt_len_pre" # print generation
mv "$CUR_PROMPT_FILE.tmp" "$CUR_PROMPT_FILE"
# if we hit n_predict instead of reverse-prompt, we need to add the prompt
if [[ "$(tail -n1 "$CUR_PROMPT_FILE")" != "${USER_NAME}:" ]]; then
printf '\n%s:' "$USER_NAME"
printf '\n%s:' "$USER_NAME" >> "$CUR_PROMPT_FILE"
fi
printf ' '
if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
exit 1
fi
n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")
if ((n_tokens > CTX_ROTATE_POINT)); then
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
fi
# Update cache for next prompt in background, ideally during user input
./llama-cli >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
--prompt-cache "$NEXT_PROMPT_CACHE" \
--file "$NEXT_PROMPT_FILE" \
--n_predict 1 &
done

View File

@@ -1,41 +0,0 @@
#!/usr/bin/env bash
set -e
cd "$(dirname "$0")/.." || exit
MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
USER_NAME="### Human"
AI_NAME="### Assistant"
# Adjust to the number of CPU cores you want to use.
N_THREAD="${N_THREAD:-8}"
# Number of tokens to predict (made it larger than default because we want a long interaction)
N_PREDICTS="${N_PREDICTS:-2048}"
# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)
PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./bin/llama-cli $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--file ${PROMPT_FILE} \
--reverse-prompt "### Human:" \
--in-prefix ' ' \
"$@"

View File

@@ -1,16 +0,0 @@
#!/usr/bin/env bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
# Important:
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
./llama-cli -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt

View File

@@ -333,17 +333,17 @@ static void print_params(struct my_llama_hparams * params) {
}
static void print_tensor_info(const struct ggml_context * ctx) {
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
for (auto * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
LOG_INF("%s: Allocating ", __func__);
int64_t total = 1;
int i = 0;
for (; i < ggml_n_dims(t); ++i) {
if (i > 0) LOG("x ");
LOG("[%" PRId64 "] ", t->ne[i]);
if (i > 0) { LOG_INF("x "); }
LOG_INF("[%" PRId64 "] ", t->ne[i]);
total *= t->ne[i];
}
if (i > 1) LOG("= [%" PRId64 "] ", total);
LOG("float space for %s\n", ggml_get_name(t));
if (i > 1) { LOG_INF("= [%" PRId64 "] ", total); }
LOG_INF("float space for %s\n", ggml_get_name(t));
}
}

View File

@@ -510,19 +510,27 @@ static void diffusion_generate(llama_context * ctx,
n_generated = params.max_length;
}
static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
static std::string format_input_text(const std::string & prompt, const std::string & system_prompt, bool use_chat_template, llama_model * model) {
if (!use_chat_template) {
return prompt;
}
auto chat_templates = common_chat_templates_init(model, "");
common_chat_templates_inputs inputs;
common_chat_msg user_msg;
user_msg.role = "user";
user_msg.content = prompt;
inputs.add_generation_prompt = true;
common_chat_msg system_msg;
if (!system_prompt.empty()) {
system_msg.role = "system";
system_msg.content = system_prompt;
inputs.messages.push_back(system_msg);
}
common_chat_msg user_msg;
user_msg.role = "user";
user_msg.content = prompt;
inputs.messages.push_back(user_msg);
inputs.add_generation_prompt = true;
auto result = common_chat_templates_apply(chat_templates.get(), inputs);
@@ -564,7 +572,7 @@ int main(int argc, char ** argv) {
ctx_params.n_ctx = params.n_ctx;
ctx_params.n_batch = params.n_batch;
ctx_params.n_ubatch = params.n_ubatch;
ctx_params.flash_attn = params.flash_attn;
ctx_params.flash_attn_type = params.flash_attn_type;
ctx_params.no_perf = params.no_perf;
ctx_params.type_k = params.cache_type_k;
ctx_params.type_v = params.cache_type_v;
@@ -579,7 +587,8 @@ int main(int argc, char ** argv) {
llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads);
const llama_vocab * vocab = llama_model_get_vocab(model);
std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);
std::string formatted_prompt = format_input_text(params.prompt, params.system_prompt, params.enable_chat_template, model);
std::vector<llama_token> input_tokens = common_tokenize(vocab,
formatted_prompt,
@@ -596,6 +605,7 @@ int main(int argc, char ** argv) {
}
llama_token mask_token_id = llama_vocab_mask(vocab);
GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);
bool visual_mode = params.diffusion.visual_mode;

View File

@@ -43,8 +43,8 @@ The above command will output space-separated float values.
| $"string"$ | |
|--------------|-|
| "\n" | (default)
| "<#embSep#>" | for exemple
| "<#sep#>" | other exemple
| "<#embSep#>" | for example
| "<#sep#>" | other example
## examples
### Unix-based systems (Linux, macOS, etc.):

View File

@@ -28,6 +28,15 @@ static std::string ggml_ne_string(const ggml_tensor * t) {
return str;
}
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
union {
float f;
uint32_t i;
} u;
u.i = (uint32_t)h.bits << 16;
return u.f;
}
static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
@@ -43,6 +52,8 @@ static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t *
v = (float) *(int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
} else if (type == GGML_TYPE_BF16) {
v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
} else {
GGML_ABORT("fatal error");
}
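A small sanity check of the conversion rule used above, done in NumPy for illustration: the 16 stored BF16 bits are moved into the upper half of a 32-bit word and reinterpreted as a float:

```python
# Illustrative only: BF16 shares FP32's exponent, so shifting the bits left by
# 16 and reinterpreting the word as float32 recovers the value.
import numpy as np

def bf16_bits_to_fp32(bits: int) -> float:
    word = np.array([bits << 16], dtype=np.uint32)  # BF16 bits go in the high half
    return float(word.view(np.float32)[0])

print(bf16_bits_to_fp32(0x3F80))  # 1.0
print(bf16_bits_to_fp32(0xC000))  # -2.0
```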

View File

@@ -1,5 +0,0 @@
set(TARGET llama-gritlm)
add_executable(${TARGET} gritlm.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -1,62 +0,0 @@
## Generative Representational Instruction Tuning (GRIT) Example
[gritlm] a model which can generate embeddings as well as "normal" text
generation depending on the instructions in the prompt.
* Paper: https://arxiv.org/pdf/2402.09906.pdf
### Retrieval-Augmented Generation (RAG) use case
One use case for `gritlm` is to use it with RAG. If we recall how RAG works is
that we take documents that we want to use as context, to ground the large
language model (LLM), and we create token embeddings for them. We then store
these token embeddings in a vector database.
When we perform a query, prompt the LLM, we will first create token embeddings
for the query and then search the vector database to retrieve the most
similar vectors, and return those documents so they can be passed to the LLM as
context. Then the query and the context will be passed to the LLM which will
have to _again_ create token embeddings for the query. But because gritlm is used
the first query can be cached and the second query tokenization generation does
not have to be performed at all.
### Running the example
Download a Grit model:
```console
$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
```
Run the example using the downloaded model:
```console
$ ./llama-gritlm -m models/gritlm-7b_q4_1.gguf
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
Oh, brave adventurer, who dared to climb
The lofty peak of Mt. Fuji in the night,
When shadows lurk and ghosts do roam,
And darkness reigns, a fearsome sight.
Thou didst set out, with heart aglow,
To conquer this mountain, so high,
And reach the summit, where the stars do glow,
And the moon shines bright, up in the sky.
Through the mist and fog, thou didst press on,
With steadfast courage, and a steadfast will,
Through the darkness, thou didst not be gone,
But didst climb on, with a steadfast skill.
At last, thou didst reach the summit's crest,
And gazed upon the world below,
And saw the beauty of the night's best,
And felt the peace, that only nature knows.
Oh, brave adventurer, who dared to climb
The lofty peak of Mt. Fuji in the night,
Thou art a hero, in the eyes of all,
For thou didst conquer this mountain, so bright.
```
[gritlm]: https://github.com/ContextualAI/gritlm

View File

@@ -1,231 +0,0 @@
#include "arg.h"
#include "common.h"
#include "llama.h"
#include <string>
#include <vector>
// #define GRIT_DEBUG
static std::vector<std::vector<float>> encode(llama_context * ctx, const std::vector<std::string> & sentences, const std::string & instruction) {
std::vector<std::vector<float>> result;
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
for (uint64_t i = 0; i < sentences.size(); i++) {
common_batch_clear(batch);
const std::string input_string = instruction + sentences[i];
std::vector<llama_token> inputs = common_tokenize(vocab, input_string, true, false);
const int32_t n_toks = inputs.size();
// GritLM seems to have EOS = ""
// https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18
// inputs.push_back(llama_vocab_eos(vocab));
// we want to ignore instruction tokens for mean pooling
const int32_t n_inst = common_tokenize(vocab, instruction, true, false).size();
#ifdef GRIT_DEBUG
// debug tokens - should be matching as referenced in the GritLM sample
std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) {
std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str());
});
std::printf("\n");
#endif
// add input to batch (this increments n_tokens)
for (int32_t j = 0; j < n_toks; j++) {
common_batch_add(batch, inputs[j], j, { 0 }, true);
}
// clear previous kv_cache values (irrelevant for embeddings)
llama_memory_clear(llama_get_memory(ctx), true);
llama_set_causal_attn(ctx, false);
// run model
llama_decode(ctx, batch);
// get embedding dimensions
uint64_t n_embd = llama_model_n_embd(model);
// allocate embedding output
std::vector<float> emb_unorm(n_embd, 0.0f);
// sum up all token embeddings
for (int32_t k = n_inst; k < n_toks; k++) {
float * emb = llama_get_embeddings_ith(ctx, k);
for (uint64_t j = 0; j < n_embd; j++) {
emb_unorm[j] += emb[j];
}
}
// divide by number of tokens (mean pooling)
{
const uint64_t n_sent = n_toks - n_inst;
for (uint64_t j = 0; j < n_embd; j++) {
emb_unorm[j] /= n_sent;
}
}
std::vector<float> emb_norm(emb_unorm.size());
common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd, 2);
result.push_back(emb_norm);
#ifdef GRIT_DEBUG
// print out emb_norm
std::printf("embedding %ld: ", i);
for (uint64_t j = 0; j < n_embd; j++) {
std::printf("%.5f ", emb_norm[j]);
}
std::printf("\n\n");
#endif
}
llama_batch_free(batch);
return result;
}
static std::string generate(llama_context * ctx, llama_sampler * smpl, const std::string & prompt, bool stream) {
std::string result;
const llama_model * model = llama_get_model(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);
llama_token eos_token = llama_vocab_eos(vocab);
llama_memory_clear(llama_get_memory(ctx), true);
llama_set_causal_attn(ctx, true);
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
std::vector<llama_token> inputs = common_tokenize(vocab, prompt, false, true);
int32_t i_current_token = 0;
while (true) {
common_batch_clear(bat);
{
const int32_t n_inputs = inputs.size();
for (int32_t i = 0; i < n_inputs; i++) {
common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
}
}
inputs.clear();
llama_decode(ctx, bat);
llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
if (token == eos_token) {
break;
}
std::string piece = common_token_to_piece(ctx, token);
if (stream) {
std::printf("%s", piece.c_str());
std::fflush(stdout);
}
inputs.push_back(token);
result += piece;
}
if (stream) {
std::printf("\n");
}
llama_batch_free(bat);
return result;
}
static std::string gritlm_instruction(const std::string & instruction) {
return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n";
}
int main(int argc, char * argv[]) {
common_params params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}
common_init();
llama_model_params mparams = common_model_params_to_llama(params);
llama_context_params cparams = common_context_params_to_llama(params);
cparams.embeddings = true;
llama_backend_init();
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
// create generation context
llama_context * ctx = llama_init_from_model(model, cparams);
auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false;
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
// ### Embedding/Representation ###
// samples taken from: https://github.com/ContextualAI/gritlm#basic
{
const std::string instruction = "Given a scientific paper title, retrieve the paper's abstract";
const std::vector<std::string> queries = {
"Bitcoin: A Peer-to-Peer Electronic Cash System",
"Generative Representational Instruction Tuning",
};
const std::vector<std::string> documents = {
"A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.",
"All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.",
};
// No need to add instruction for retrieval documents
const std::vector<std::vector<float>> d_rep = encode(ctx, documents, gritlm_instruction(""));
const std::vector<std::vector<float>> q_rep = encode(ctx, queries, gritlm_instruction(instruction));
const int n_embd = llama_model_n_embd(model);
const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
const float cosine_sim_q1_d0 = common_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
const float cosine_sim_q1_d1 = common_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[0].c_str(), cosine_sim_q1_d0);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1);
}
llama_set_embeddings(ctx, false);
// ### Generation ###
// GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction
{
const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n";
std::string response = generate(ctx, smpl, prompt, true);
}
llama_sampler_free(smpl);
llama_free(ctx);
llama_model_free(model);
llama_backend_free();
return 0;
}

View File

@@ -1,21 +0,0 @@
# llama.cpp/example/jeopardy
This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.
The jeopardy test can be used to compare the fact knowledge of different models and compare them to each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
Step 1: Open jeopardy.sh and modify the following:
```
MODEL=(path to your model)
MODEL_NAME=(name of your model)
prefix=(basically, if you use vicuna it's Human: , if you use something else it might be User: , etc)
opts=(add -instruct here if needed for your model, or anything else you want to test out)
```
Step 2: Run `jeopardy.sh` from the llama.cpp folder
Step 3: Repeat steps 1 and 2 until you have all the results you need.
Step 4: Run `graph.py`, and follow the instructions. At the end, it will generate your final graph.
Note: The Human bar is based off of the full, original 100 sample questions. If you modify the question count or questions, it will not be valid.

View File

@@ -1,58 +0,0 @@
#!/usr/bin/env python3
import matplotlib.pyplot as plt
import os
import csv
labels = []
numbers = []
numEntries = 1
rows = []
def bar_chart(numbers, labels, pos):
plt.bar(pos, numbers, color='blue')
plt.xticks(ticks=pos, labels=labels)
plt.title("Jeopardy Results by Model")
plt.xlabel("Model")
plt.ylabel("Questions Correct")
plt.show()
def calculatecorrect():
directory = os.fsencode("./examples/jeopardy/results/")
csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
for row in csv_reader:
global rows
rows.append(row)
for listing in os.listdir(directory):
filename = os.fsdecode(listing)
if filename.endswith(".txt"):
file = open("./examples/jeopardy/results/" + filename, "rt")
global labels
global numEntries
global numbers
labels.append(filename[:-4])
numEntries += 1
i = 1
totalcorrect = 0
for line in file.readlines():
if line.strip() != "------":
print(line)
else:
print("Correct answer: " + rows[i][2] + "\n")
i += 1
print("Did the AI get the question right? (y/n)")
if input() == "y":
totalcorrect += 1
numbers.append(totalcorrect)
if __name__ == '__main__':
calculatecorrect()
pos = list(range(numEntries))
labels.append("Human")
numbers.append(48.11)
bar_chart(numbers, labels, pos)
print(labels)
print(numbers)

View File

@@ -1,30 +0,0 @@
#!/usr/bin/env bash
set -e
MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin
MODEL_NAME=Vicuna
# exec options
prefix="Human: " # Ex. Vicuna uses "Human: "
opts="--temp 0 -n 80" # additional flags
nl='
'
introduction="You will be playing a game of Jeopardy. Simply answer the question in the correct format (Ex. What is Paris, or Who is George Washington)."
# file options
question_file=./examples/jeopardy/questions.txt
touch ./examples/jeopardy/results/$MODEL_NAME.txt
output_file=./examples/jeopardy/results/$MODEL_NAME.txt
counter=1
echo 'Running'
while IFS= read -r question
do
exe_cmd="./llama-cli -p "\"$prefix$introduction$nl$prefix$question\"" "$opts" -m ""\"$MODEL\""" >> ""\"$output_file\""
echo $counter
echo "Current Question: $question"
eval "$exe_cmd"
echo -e "\n------" >> $output_file
counter=$((counter+1))
done < "$question_file"

View File

@@ -1,103 +0,0 @@
Index,Original Category,Original Correct Question,Model Prompt
1,The Oscars,Who is John Williams?,Which actor Born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
2,English Literature,What is Paradise Lost?,"What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?"
3,Writers Lesser-Known Works,Who is Niccolò Machiavelli?,"Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?"
4,Exploration,What is Easter Island (Rapa Nui)?,"James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?"
5,The Bill of Rights,What is the Eighth Amendment?,England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
6,Nobel Peace Prize Winners,Who are Nelson Mandela & Desmond Tutu?,"Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?"
7,Famous Names,Who is Walt Disney?,"In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?"
8,Geography,What is Colombia?,"Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?"
9,Fashion History,What are rhinestones?,"Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?"
10,Movies of the 80s,What is Driving Miss Daisy?,What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
11,Novelists,Who is John Grisham?,"A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?"
12,20th Century Eponyms,What is the Maginot Line?,"A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?"
13,City History,What is Stockholm?,"Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?"
14,Brand Names,What is Jacuzzi?,"The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?"
15,American Authors,Who is Washington Irving?,"In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?"
16,Symbols,What is “less than”?,What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
17,Movie Theme Songs,Who is James Bond?,"Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?"
18,American Novelists,Who is Joseph Heller?,"What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?"
19,Medieval Places,"What is Canterbury, England? (Canterbury Cathedral)","In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?"
20,Countries of Africa,What is Morocco?,"At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?"
21,Statehood,What is Wyoming?,Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
22,1980s Movies,What is Raiders of the Lost Ark?,"A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?"
23,Art Exhibitions,Who is Rembrandt?,In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
24,Countries of the World,What is Mongolia?,"Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?"
25,Literature,What is “Howl”?,A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
26,Invasions,Who is William of Orange?,"Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?"
27,Landmarks,What is the Eiffel Tower?,"After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?"
28,Geographic Names the Same,What is Dover?,"The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?"
29,Names in the Bookstore,Who is Peter Mark Roget?,"This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?"
30,U.S. History,Who is Dr. Samuel Mudd?,"An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?"
31,American Literature,What is The Things They Carried?,"Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?"
32,Nonfiction,What is The Communist Manifesto,"What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?"
33,Laws in U.S. History,What is the Civil Rights Act?,"A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later?"
34,Names of Myth,Who is Helen of Troy?,"Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?"
35,African Countries,What is Sudan?,"Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?"
36,The Ancient World,What is Alexandria?,"The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?"
37,Famous Names,Who is Andy Warhol?,"For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?"
38,People & Places,What is Guam?,"Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory’s largest ethnic group?"
39,Current World Leaders,What is the Philippines?,"In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?"
40,Writers & The South,Who is Tennessee Williams?,In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
41,National Parks,What is Yellowstone?,"What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?"
42,Sports,Who are the Harlem Globetrotters?,"In 2010 who introduced the 4-point shot, 35 feet from the basket?"
43,The U.S. Military,What is “Top Gun”?,Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
44,Art & Science,What is Halley’s Comet?,"A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?"
45,Words From World War I,What is “tank”?,"In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?"
46,European History,What is Holy Roman Emperor?,"Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?"
47,Theater History,Who is Peter Pan?,"In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?"
48,European Cities,What is Aachen?,"Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?"
49,Word Origins,What is mantra?,This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
50,Inventions,What is barbed wire?,1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
51,World War II,What is Schindler’s list?,"Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?"
52,Mythology,What is the Golden Fleece?,"Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object?"
53,Literature,What is Pride and Prejudice?,"Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?"
54,U.S. State Names,What are Oregon & Nevada?,"5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other?"
55,Word Origins,What is passion?,"Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?"
56,World Cinema,What is La Vie en Rose?,"The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?"
57,History,What is Santa Maria?,"Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?"
58,Landmarks,What is a kremlin?,Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
59,Foreign-Born Authors,Who is Vladimir Nabokov?,In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
60,Astronomy & Geography,What is Capricorn?,"At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?"
61,Television,What is Law & Order?,"Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?"
62,British Landmarks,What is the Tower of London?,"Like Sir Thomas More, 3 16th century English queens are buried at what British location?"
63,Early American History,What are witches?,"In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?"
64,Geography Mnemonics,What are Arkansas and Louisiana?,"The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?"
65,Business Milestones,What is the Ford Model T?,"What was first sold in 1908, at a price equivalent to about $27,000 today?"
66,In The Bookstore,Who is Tom Clancy?,The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
67,Historic Art,What is the Bayeux Tapestry?,The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
68,Pop Stars,Who is Madonna?,In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
69,Classic Tale Characters,Who is Scheherazade?,"In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?"
70,USA,What is Jack Daniel’s?,"Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?"
71,Historic People,Who was William Bligh?,"After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?"
72,The Movies,What is The Godfather?,Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
73,Continental Geography,What is Colombia?,"Until a 1903 secession, what country's contiguous territory spanned 2 continents?"
74,Foreign-Born Authors,Who is Isabel Allende?,"Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?"
75,Historic Crimes,What is the Mona Lisa?,"Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?"
76,U.S. Bodies of Water,What is Lake Mead?,"Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?"
77,Gods & Goddesses,Who is Aurora (or Eos)?,"Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?"
78,America At War,What is the Battle of New Orleans?,"Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?"
79,Children’s Books,What is The Velveteen Rabbit?,"Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?"
80,TV Finales,What is Grace and Frankie?,"In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?"
81,American Poems,Who is Evangeline?,"In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?"
82,Famous Names,Who is Banksy?,"In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?"
83,Children’s Lit,What is Charlotte’s Web?,The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
84,Classic Songs,What is “Here Comes Santa Claus”?,The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
85,Brand Names,What are Milk Duds?,"Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?"
86,Countries of the World,What is Italy?,"What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?"
87,Action Movies,What is Die Hard?,"What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?"
88,Presidential Facts,Who is Woodrow Wilson?,Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
89,19th Century Americans,Who is Frederick Douglass?,"Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?"
90,Latin Phrases,What is “quid pro quo”?,"Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?"
91,1970s Movies,What is Monty Python and the Holy Grail?,The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
92,Names The Same,What is Manhattan?,"A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?"
93,U.S. Presidents,Who is Calvin Coolidge?,"Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?"
94,Plays,What is The Tempest?,A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
95,Landmarks,What is the Berlin Wall?,"In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?"
96,World Capitals,"What is Vienna, Austria?","Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?"
97,Language & Its Meanings,What is a night owl?,"Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?"
98,Flags of Our Hemisphere,What is Brazil?,"The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?"
99,Names in U.S. History,Who is Oliver Brown?,What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
100,Children’s Authors,"Who is Sarah? (from Sarah, Plain and Tall)","Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?"
,,,
TOTALS,,,

View File

@@ -1,100 +0,0 @@
Which man born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?
Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?
James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?
England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?
In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?
Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?
Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?
What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?
A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?
Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?
The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?
In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?
What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?
What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?
In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?
At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?
Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?
In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?
A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?
After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?
The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?
This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?
An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?
Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?
What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?
A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later?
Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?
Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?
The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?
For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?
Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory’s largest ethnic group?
In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?
In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?
In 2010 who introduced the 4-point shot, 35 feet from the basket?
Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?
In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?
Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?
In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?
Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?
This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?
Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object?
Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?
5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other?
Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?
The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?
Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?
Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?
Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?
Like Sir Thomas More, 3 16th century English queens are buried at what British location?
In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?
The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?
What was first sold in 1908, at a price equivalent to about $27,000 today?
The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?
Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?
After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?
Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
Until a 1903 secession, what country's contiguous territory spanned 2 continents?
Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?
Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?
Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?
Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?
Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?
Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?
In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?
In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?
In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?
The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?
What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?
What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?
Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?
Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?
The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?
Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?
A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?
Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?
Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?
The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?
What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?

View File

@@ -586,9 +586,10 @@ class SchemaConverter:
            properties = list(schema.get('properties', {}).items())
            return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties')))

        elif schema_type in (None, 'object') and 'allOf' in schema:
        elif schema_type in (None, 'object', 'string') and 'allOf' in schema:
            required = set()
            properties = []
            enum_sets = []
            hybrid_name = name
            def add_component(comp_schema, is_required):
                if (ref := comp_schema.get('$ref')) is not None:
@@ -600,6 +601,9 @@ class SchemaConverter:
                        if is_required:
                            required.add(prop_name)

                if 'enum' in comp_schema:
                    enum_sets.append(set(comp_schema['enum']))

            for t in schema['allOf']:
                if 'anyOf' in t:
                    for tt in t['anyOf']:
@@ -607,6 +611,15 @@ class SchemaConverter:
                else:
                    add_component(t, is_required=True)

            if enum_sets:
                enum_intersection = enum_sets[0]
                for s in enum_sets[1:]:
                    enum_intersection &= s

                if enum_intersection:
                    rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space'
                    return self._add_rule(rule_name, rule)

            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))

        elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
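The change above lets `allOf` components contribute `enum` constraints: the converter collects one value set per component and, if their intersection is non-empty, emits a rule over those constant values instead of an object rule. A standalone sketch of that intersection step (plain Python; the helper name and the toy schema are illustrative, not part of the real converter):

```python
# Sketch of the allOf enum-intersection logic added above (not the real SchemaConverter).
from functools import reduce

def allof_enum_intersection(schema: dict):
    # Collect one set of allowed values per allOf component that carries an enum.
    enum_sets = [set(comp["enum"]) for comp in schema.get("allOf", []) if "enum" in comp]
    if not enum_sets:
        return None
    # Only values permitted by every component survive.
    common = reduce(lambda a, b: a & b, enum_sets)
    return sorted(common) if common else None

# Example: both components constrain the same string-typed field.
schema = {"type": "string", "allOf": [{"enum": ["a", "b", "c"]}, {"enum": ["b", "c"]}]}
print(allof_enum_intersection(schema))  # ['b', 'c']
```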

View File

@@ -1,28 +0,0 @@
" Basic plugin example
function! Llm()
let url = "http://127.0.0.1:8080/completion"
" Get the content of the current buffer
let buffer_content = join(getline(1, '$'), "\n")
" Create the JSON payload
let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":256,"stop": ["\n\n\n"],"stream": v:false}
let json_payload.prompt = buffer_content
" Define the curl command
let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url
let response = system(curl_command, json_encode(json_payload))
" Extract the content field from the response
let content = json_decode(response).content
let split_newlines = split(content, '\n', 1)
" Insert the content at the cursor position
call setline(line('.'), [ getline('.') . split_newlines[0] ] + split_newlines[1:])
endfunction
command! Llm call Llm()
noremap <F2> :Llm<CR>

View File

@@ -63,7 +63,7 @@ causal-verify-logits: causal-run-original-model causal-run-converted-model
@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}
causal-run-original-embeddings:
@./scripts/causal/run-casual-gen-embeddings-org.sh
@./scripts/causal/run-casual-gen-embeddings-org.py
causal-run-converted-embeddings:
@./scripts/causal/run-converted-model-embeddings-logits.sh

View File

@@ -105,12 +105,12 @@ new model, the model can be converted to GGUF format using the following command
### Inspecting the converted model
The converted model can be inspected using the following command:
```console
(venv) $ make inspect-converted-model
(venv) $ make causal-inspect-converted-model
```
### Running the converted model
```console
(venv) $ make run-converted-model
(venv) $ make causal-run-converted-model
```
### Model logits verification

View File

@@ -1,5 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
torch~=2.6.0
torchvision~=0.21.0
transformers~=4.55.0
huggingface-hub~=0.34.0
torch
torchvision
transformers
huggingface-hub
accelerate

View File

@@ -1,4 +1,4 @@
#/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -48,7 +48,7 @@ def main():
print(f"Error: Model file not found: {model_path}")
sys.exit(1)
model_name = os.path.splitext(os.path.basename(model_path))[0]
model_name = os.path.basename(model_path)
data_dir = Path("data")
pytorch_file = data_dir / f"pytorch-{model_name}.bin"
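For context on why `os.path.splitext` is dropped here: it treats everything after the last dot as an extension, so a model name that legitimately contains a dot gets truncated and the derived data file names no longer match. A small illustration (the model name below is a made-up example):

```python
import os

model_path = "/models/Example-2.6B"  # hypothetical model directory containing a "."

# splitext strips everything after the last dot, mangling the name:
print(os.path.splitext(os.path.basename(model_path))[0])  # Example-2

# basename keeps the full directory name intact:
print(os.path.basename(model_path))  # Example-2.6B
```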

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -3,11 +3,10 @@
import argparse
import os
import importlib
import sys
import torch
import numpy as np
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from pathlib import Path
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
@@ -43,6 +42,8 @@ if unreleased_model_name:
        model = model_class.from_pretrained(model_path)
    except (ImportError, AttributeError) as e:
        print(f"Failed to import or load model: {e}")
        print("Falling back to AutoModelForCausalLM")
        model = AutoModelForCausalLM.from_pretrained(model_path)
else:
    model = AutoModelForCausalLM.from_pretrained(model_path)

print(f"Model class: {type(model)}")

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -9,15 +9,134 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch
import numpy as np
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
### If you want to dump RoPE activations, apply this monkey patch to the model
### class from Transformers that you are running (replace apertus.modeling_apertus
### with the proper package and class for your model
### === START ROPE DEBUG ===
# from transformers.models.apertus.modeling_apertus import apply_rotary_pos_emb
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
# orig_rope = apply_rotary_pos_emb
# torch.set_printoptions(threshold=float('inf'))
# torch.set_printoptions(precision=6, sci_mode=False)
# def debug_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
# # log inputs
# summarize(q, "RoPE.q_in")
# summarize(k, "RoPE.k_in")
# # call original
# q_out, k_out = orig_rope(q, k, cos, sin, position_ids, unsqueeze_dim)
# # log outputs
# summarize(q_out, "RoPE.q_out")
# summarize(k_out, "RoPE.k_out")
# return q_out, k_out
# # Patch it
# import transformers.models.apertus.modeling_apertus as apertus_mod # noqa: E402
# apertus_mod.apply_rotary_pos_emb = debug_rope
### == END ROPE DEBUG ===
def summarize(tensor: torch.Tensor, name: str, max_seq: int = 3, max_vals: int = 3):
    """
    Print a tensor in llama.cpp debug style.
    Supports:
    - 2D tensors (seq, hidden)
    - 3D tensors (batch, seq, hidden)
    - 4D tensors (batch, seq, heads, dim_per_head) via flattening heads × dim_per_head
    Shows first and last max_vals of each vector per sequence position.
    """
    t = tensor.detach().to(torch.float32).cpu()

    # Determine dimensions
    if t.ndim == 3:
        _, s, _ = t.shape
    elif t.ndim == 2:
        _, s = 1, t.shape[0]
        t = t.unsqueeze(0)
    elif t.ndim == 4:
        _, s, _, _ = t.shape
    else:
        print(f"Skipping tensor due to unsupported dimensions: {t.ndim}")
        return

    ten_shape = t.shape

    print(f"ggml_debug: {name} = (f32) ... = {{{ten_shape}}}")
    print(" [")
    print(" [")

    # Determine indices for first and last sequences
    first_indices = list(range(min(s, max_seq)))
    last_indices = list(range(max(0, s - max_seq), s))

    # Check if there's an overlap between first and last indices or if we're at the edge case of s = 2 * max_seq
    has_overlap = bool(set(first_indices) & set(last_indices)) or (max_seq * 2 == s)

    # Combine indices
    if has_overlap:
        # If there's overlap, just use the combined unique indices
        indices = sorted(list(set(first_indices + last_indices)))
        separator_index = None
    else:
        # If no overlap, we'll add a separator between first and last sequences
        indices = first_indices + last_indices
        separator_index = len(first_indices)

    for i, si in enumerate(indices):
        # Add separator if needed
        if separator_index is not None and i == separator_index:
            print(" ...")

        # Extract appropriate slice
        vec = t[0, si]
        if vec.ndim == 2: # 4D case: flatten heads × dim_per_head
            flat = vec.flatten().tolist()
        else: # 2D or 3D case
            flat = vec.tolist()

        # First and last slices
        first = flat[:max_vals]
        last = flat[-max_vals:] if len(flat) >= max_vals else flat

        first_str = ", ".join(f"{v:12.4f}" for v in first)
        last_str = ", ".join(f"{v:12.4f}" for v in last)

        print(f" [{first_str}, ..., {last_str}]")
    print(" ],")
    print(" ]")
    print(f" sum = {t.sum().item():.6f}\n")


def debug_hook(name):
    def fn(_m, input, output):
        if isinstance(input, torch.Tensor):
            summarize(input, name + "_in")
        elif isinstance(input, (tuple, list)) and isinstance(input[0], torch.Tensor):
            summarize(input[0], name + "_in")
        if isinstance(output, torch.Tensor):
            summarize(output, name + "_out")
        elif isinstance(output, (tuple, list)) and isinstance(output[0], torch.Tensor):
            summarize(output[0], name + "_out")
    return fn
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
parser = argparse.ArgumentParser(description="Process model with specified path")
parser.add_argument("--model-path", "-m", help="Path to the model")
args = parser.parse_args()
model_path = os.environ.get('MODEL_PATH', args.model_path)
model_path = os.environ.get("MODEL_PATH", args.model_path)
if model_path is None:
    parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
    parser.error(
        "Model path must be specified either via --model-path argument or MODEL_PATH environment variable"
    )
config = AutoConfig.from_pretrained(model_path)
@@ -34,18 +153,30 @@ config = AutoConfig.from_pretrained(model_path)
if unreleased_model_name:
    model_name_lower = unreleased_model_name.lower()
    unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
    unreleased_module_path = (
        f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
    )
    class_name = f"{unreleased_model_name}ForCausalLM"
    print(f"Importing unreleased model module: {unreleased_module_path}")
    try:
        model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
        model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained
        model_class = getattr(
            importlib.import_module(unreleased_module_path), class_name
        )
        model = model_class.from_pretrained(
            model_path
        )  # Note: from_pretrained, not fromPretrained
    except (ImportError, AttributeError) as e:
        print(f"Failed to import or load model: {e}")
        exit(1)
else:
    model = AutoModelForCausalLM.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path, device_map="auto", offload_folder="offload"
    )

for name, module in model.named_modules():
    if len(list(module.children())) == 0:  # only leaf modules
        module.register_forward_hook(debug_hook(name))

model_name = os.path.basename(model_path)
model_name = os.path.basename(model_path)
# Printing the Model class to allow for easier debugging. This can be useful
@@ -62,7 +193,7 @@ print(f"Input text: {repr(prompt)}")
print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
with torch.no_grad():
    outputs = model(input_ids)
    outputs = model(input_ids.to(model.device))
    logits = outputs.logits
# Extract logits for the last token (next token prediction)
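To show what the hook-based instrumentation above does in isolation, here is a minimal, self-contained sketch that registers a forward hook on every leaf module of a toy model; it assumes only PyTorch, and the tiny model plus the simplified `summarize()` below are illustrative stand-ins for the full script:

```python
# Minimal sketch of the forward-hook activation dumping used in the script above.
# The toy model and the simplified summarize() are illustrative assumptions.
import torch
import torch.nn as nn

def summarize(tensor: torch.Tensor, name: str, max_vals: int = 3):
    flat = tensor.detach().to(torch.float32).reshape(-1)
    head = ", ".join(f"{v:10.4f}" for v in flat[:max_vals].tolist())
    print(f"{name}: shape={tuple(tensor.shape)} first=[{head}] sum={flat.sum().item():.6f}")

def debug_hook(name):
    def fn(_module, inputs, output):
        if isinstance(inputs, (tuple, list)) and inputs and isinstance(inputs[0], torch.Tensor):
            summarize(inputs[0], name + "_in")
        if isinstance(output, torch.Tensor):
            summarize(output, name + "_out")
    return fn

model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))
for name, module in model.named_modules():
    if len(list(module.children())) == 0:  # only leaf modules, as in the script above
        module.register_forward_hook(debug_hook(name))

with torch.no_grad():
    model(torch.randn(1, 8))
```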

View File

@@ -1,4 +1,4 @@
#/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -7,7 +7,7 @@ base_model:
Recommended way to run this model:
```sh
llama-server -hf {namespace}/{model_name}-GGUF
llama-server -hf {namespace}/{model_name}-GGUF --embeddings
```
Then the endpoint can be accessed at http://localhost:8080/embedding, for
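A small sketch of calling that endpoint from Python with only the standard library (it assumes the server was started with `--embeddings` as shown above, and prints the raw JSON rather than assuming a particular response layout):

```python
# Sketch: POST to the llama-server /embedding endpoint (server started with --embeddings).
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8080/embedding",
    data=json.dumps({"input": "Hello world today"}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    result = json.load(resp)

# Print the beginning of the raw response; its exact shape depends on the server version.
print(json.dumps(result, indent=2)[:500])
```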

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -67,7 +67,7 @@ def main():
    parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
    args = parser.parse_args()

    model_name = os.path.splitext(os.path.basename(args.model_path))[0]
    model_name = os.path.basename(args.model_path)
    data_dir = Path("data")
    pytorch_file = data_dir / f"pytorch-{model_name}.bin"

View File

@@ -1,4 +1,6 @@
#!/usr/bin/env bash
COLLECTION_SLUG=$(python ./create_collection.py --return-slug)
echo "Created collection: $COLLECTION_SLUG"

View File

@@ -0,0 +1,6 @@
#!/usr/bin/env bash
curl --request POST \
--url http://localhost:8080/embedding \
--header "Content-Type: application/json" \
--data '{"input": "Hello world today"}' \
--silent

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"

View File

@@ -40,7 +40,7 @@ if os.path.exists(index_path):
        file_path = os.path.join(model_path, file_name)
        print(f"\n--- From {file_name} ---")

        with safe_open(file_path, framework="pt") as f:
        with safe_open(file_path, framework="pt") as f: # type: ignore
            for tensor_name in sorted(tensor_names):
                tensor = f.get_tensor(tensor_name)
                print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
@@ -49,7 +49,7 @@ elif os.path.exists(single_file_path):
# Single file model (original behavior)
print("Single-file model detected")
with safe_open(single_file_path, framework="pt") as f:
with safe_open(single_file_path, framework="pt") as f: # type: ignore
keys = f.keys()
print("Tensors in model:")
for key in sorted(keys):

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e
#

View File

@@ -145,6 +145,20 @@ int main(int argc, char ** argv) {
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
if (llama_model_has_encoder(model)) {
if (llama_encode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
decoder_start_token_id = llama_vocab_bos(vocab);
}
batch = llama_batch_get_one(&decoder_start_token_id, 1);
}
// main loop
const auto t_main_start = ggml_time_us();

View File

@@ -244,7 +244,7 @@ int main(int argc, char ** argv) {
// stochastic verification
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
auto & dist_tgt = *common_sampler_get_candidates(smpl);
auto & dist_tgt = *common_sampler_get_candidates(smpl, true);
float p_tgt = 0.0f;
float p_dft = 0.0f;
@@ -493,7 +493,7 @@ int main(int argc, char ** argv) {
common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl);
const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl, true);
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",

View File

@@ -1,5 +1,41 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("ggml" C CXX)
project("ggml" C CXX ASM)
### GGML Version
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 0)
set(GGML_VERSION_DEV "-dev") # "-dev" for development, "" for releases
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
if(GIT_EXE)
# Get current git commit hash
execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE GGML_BUILD_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
)
# Check if the working directory is dirty (i.e., has uncommitted changes)
execute_process(COMMAND ${GIT_EXE} diff-index --quiet HEAD -- .
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
RESULT_VARIABLE GGML_GIT_DIRTY
ERROR_QUIET
)
endif()
# Build the version string with optional -dev suffix and dirty flag
set(GGML_VERSION "${GGML_VERSION_BASE}${GGML_VERSION_DEV}")
if(GGML_GIT_DIRTY AND NOT GGML_GIT_DIRTY EQUAL 0)
set(GGML_VERSION "${GGML_VERSION}-dirty")
endif()
if(NOT GGML_BUILD_COMMIT)
set(GGML_BUILD_COMMIT "unknown")
endif()
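For example, an untagged development build from a modified working tree ends up with `GGML_VERSION` set to something like `0.9.0-dev-dirty`, while a clean release build (with `GGML_VERSION_DEV` set to the empty string) reports plain `0.9.0`.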
include(CheckIncludeFileCXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -129,10 +165,11 @@ endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
@@ -189,7 +226,6 @@ option(GGML_WEBGPU "ggml: use WebGPU"
option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
option(GGML_ZDNN "ggml: use zDNN" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
@@ -300,26 +336,6 @@ endif()
# Create CMake package
#
# Generate version info based on git commit.
if(NOT DEFINED GGML_BUILD_NUMBER)
find_program(GIT_EXE NAMES git git.exe REQUIRED NO_CMAKE_FIND_ROOT_PATH)
execute_process(COMMAND ${GIT_EXE} rev-list --count HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE GGML_BUILD_NUMBER
OUTPUT_STRIP_TRAILING_WHITESPACE
)
if(GGML_BUILD_NUMBER EQUAL 1)
message(WARNING "GGML build version fixed at 1 likely due to a shallow clone.")
endif()
execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE GGML_BUILD_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
endif()
# Capture variables prefixed with GGML_.
@@ -348,7 +364,7 @@ set(GGML_VARIABLES_EXPANDED ${variable_set_statements})
# Create the CMake package and set install location.
set(GGML_INSTALL_VERSION 0.0.${GGML_BUILD_NUMBER})
set(GGML_INSTALL_VERSION ${GGML_VERSION})
set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
set(GGML_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(GGML_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")

View File

@@ -132,6 +132,8 @@ extern "C" {
GGML_BACKEND_DEVICE_TYPE_CPU,
// GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU,
// integrated GPU device using host memory
GGML_BACKEND_DEVICE_TYPE_IGPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
};
@@ -150,11 +152,21 @@ extern "C" {
// all the device properties
struct ggml_backend_dev_props {
// device name
const char * name;
// device description
const char * description;
// device free memory in bytes
size_t memory_free;
// device total memory in bytes
size_t memory_total;
// device type
enum ggml_backend_dev_type type;
// device id
// for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
// if the id is unknown, this should be NULL
const char * device_id;
// device capabilities
struct ggml_backend_dev_caps caps;
};
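As a rough usage sketch (not part of this diff), a client could enumerate devices and read the new `device_id` field through the existing device API, assuming the usual `ggml_backend_dev_count` / `ggml_backend_dev_get` / `ggml_backend_dev_get_props` entry points:
```c
#include <stdio.h>
#include "ggml-backend.h"

// Print name, type and (possibly NULL) device id of every registered device.
static void print_devices(void) {
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        printf("%-20s type=%d id=%s free=%zu total=%zu\n",
               props.name, (int) props.type,
               props.device_id ? props.device_id : "(unknown)",
               props.memory_free, props.memory_total);
    }
}
```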
@@ -307,6 +319,9 @@ extern "C" {
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Split graph without allocating it
GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
// Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);

View File

@@ -101,7 +101,6 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
@@ -135,6 +134,7 @@ extern "C" {
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);

View File

@@ -39,18 +39,13 @@ extern "C" {
// user-code should use only these functions
//
// TODO: remove in the future
GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_DEPRECATED(
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
"obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf

View File

@@ -7,7 +7,8 @@
extern "C" {
#endif
GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);

View File

@@ -284,19 +284,19 @@ __host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexc
// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
//
#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
const type prefix##0 = (pointer)->array[0]; \
const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \
GGML_UNUSED(prefix##0);
#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
const type prefix##1 = (pointer)->array[1]; \
const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \
GGML_UNUSED(prefix##1);
#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
const type prefix##2 = (pointer)->array[2]; \
const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \
GGML_UNUSED(prefix##2);
#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
const type prefix##3 = (pointer)->array[3]; \
const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \
GGML_UNUSED(prefix##3);
#define GGML_TENSOR_UNARY_OP_LOCALS \
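For reference, with the NULL guard in place a call such as `GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)` now expands to roughly the following, which keeps the locals well-defined when the pointer is NULL:
```c
const int64_t ne10 = (src1) ? (src1)->ne[0] : 0; GGML_UNUSED(ne10);
const int64_t ne11 = (src1) ? (src1)->ne[1] : 0; GGML_UNUSED(ne11);
const int64_t ne12 = (src1) ? (src1)->ne[2] : 0; GGML_UNUSED(ne12);
const int64_t ne13 = (src1) ? (src1)->ne[3] : 0; GGML_UNUSED(ne13);
```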
@@ -511,6 +511,7 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_IM2COL_3D,
GGML_OP_CONV_2D,
GGML_OP_CONV_3D,
GGML_OP_CONV_2D_DW,
@@ -1403,6 +1404,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// note: casting from f32 to i32 will discard the fractional part
GGML_API struct ggml_tensor * ggml_cast(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -1527,7 +1529,11 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// supports 3D: a->ne[2] == b->ne[1]
// supports 4D a:
// a [n_embd, ne1, ne2, ne3]
// b I32 [n_rows, ne2, ne3, 1]
//
// return [n_embd, n_rows, ne2, ne3]
GGML_API struct ggml_tensor * ggml_get_rows(
struct ggml_context * ctx,
struct ggml_tensor * a, // data
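A minimal, hypothetical sketch of the 4D shapes described in the comment above (assumes an existing `ggml_context * ctx`; not part of this diff):
```c
// a: [n_embd=64, ne1=10, ne2=2, ne3=3], b: I32 [n_rows=4, 2, 3, 1]
struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 10, 2, 3);
struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_I32,  4,  2, 3, 1);
struct ggml_tensor * r = ggml_get_rows(ctx, a, b);
// r has shape [64, 4, 2, 3] == [n_embd, n_rows, ne2, ne3]
```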
@@ -1870,6 +1876,41 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1
GGML_API struct ggml_tensor * ggml_im2col_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int64_t IC,
int s0, // stride width
int s1, // stride height
int s2, // stride depth
int p0, // padding width
int p1, // padding height
int p2, // padding depth
int d0, // dilation width
int d1, // dilation height
int d2, // dilation depth
enum ggml_type dst_type);
// a: [OC*IC, KD, KH, KW]
// b: [N*IC, ID, IH, IW]
// result: [N*OC, OD, OH, OW]
GGML_API struct ggml_tensor * ggml_conv_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int64_t IC,
int s0, // stride width
int s1, // stride height
int s2, // stride depth
int p0, // padding width
int p1, // padding height
int p2, // padding depth
int d0, // dilation width
int d1, // dilation height
int d2 // dilation depth
);
// kernel size is a->ne[0] x a->ne[1]
// stride is equal to kernel size
// padding is zero
@@ -1941,7 +1982,7 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1
GGML_API struct ggml_tensor * ggml_conv_3d(
GGML_API struct ggml_tensor * ggml_conv_3d_direct(
struct ggml_context * ctx,
struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
struct ggml_tensor * b, // input [W, H, D, C * N]
@@ -2048,6 +2089,19 @@ extern "C" {
int p2,
int p3);
GGML_API struct ggml_tensor * ggml_pad_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int lp0,
int rp0,
int lp1,
int rp1,
int lp2,
int rp2,
int lp3,
int rp3
);
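An illustrative use of the new asymmetric padding (hypothetical tensor, assuming an existing `ggml_context * ctx`):
```c
struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);
// pad dim 0 with 1 element on the left and 2 on the right; other dims untouched
struct ggml_tensor * p = ggml_pad_ext(ctx, t, 1, 2, 0, 0, 0, 0, 0, 0);
// p has shape [7, 2, 1, 1]
```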
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
struct ggml_context * ctx,

View File

@@ -114,6 +114,9 @@ message(STATUS "GGML_SYSTEM_ARCH: ${GGML_SYSTEM_ARCH}")
if (NOT MSVC)
if (GGML_STATIC)
if (UNIX AND NOT APPLE)
set(CMAKE_FIND_LIBRARY_SUFFIXES ".a;.so")
endif()
add_link_options(-static)
if (MINGW)
add_link_options(-static-libgcc -static-libstdc++)

View File

@@ -23,7 +23,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
}
// ops that return true for this function must not use restrict pointers for their backend implementations
static bool ggml_op_can_inplace(enum ggml_op op) {
bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) {
case GGML_OP_SCALE:
case GGML_OP_DIAG_MASK_ZERO:
@@ -95,39 +95,104 @@ enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_te
// dynamic tensor allocator
#define GGML_VBUFFER_MAX_CHUNKS 16
// relative memory address within an allocation that can be split into multiple buffers (chunks)
struct buffer_address {
int chunk; // index of a backend buffer
size_t offset; // local memory offset within the buffer
};
static const struct buffer_address GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };
static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
}
struct free_block {
size_t offset;
size_t size;
};
struct tallocr_chunk {
struct free_block free_blocks[MAX_FREE_BLOCKS];
int n_free_blocks;
size_t max_size;
};
struct ggml_dyn_tallocr {
size_t alignment;
int n_free_blocks;
struct free_block free_blocks[MAX_FREE_BLOCKS];
size_t max_size;
size_t max_chunk_size;
struct tallocr_chunk * chunks[GGML_VBUFFER_MAX_CHUNKS];
int n_chunks;
#ifdef GGML_ALLOCATOR_DEBUG
struct {
const struct ggml_tensor * tensor;
size_t offset;
struct buffer_address addr;
} allocated_tensors[1024];
#endif
};
static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
int insert_pos = 0;
while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
insert_pos++;
}
// shift all blocks from insert_pos onward to make room for the new block
for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
chunk->free_blocks[i] = chunk->free_blocks[i-1];
}
// insert the new block
chunk->free_blocks[insert_pos].offset = offset;
chunk->free_blocks[insert_pos].size = size;
chunk->n_free_blocks++;
}
static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
// shift all elements after idx by 1 to the left, overwriting the element at idx
for (int i = idx; i < chunk->n_free_blocks; i++) {
chunk->free_blocks[i] = chunk->free_blocks[i+1];
}
chunk->n_free_blocks--;
}
static int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
return -1;
}
struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
chunk->n_free_blocks = 1;
chunk->free_blocks[0].offset = 0;
// available space in a chunk is limited to max_chunk_size, but can be higher if:
// 1. a single tensor exceeds the maximum, and cannot fit any other way
// 2. we are running out of chunks
// backends will either manage to allocate the larger size, or report an error.
chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
chunk->free_blocks[0].size = SIZE_MAX/2;
}
alloc->chunks[alloc->n_chunks] = chunk;
alloc->n_chunks++;
return alloc->n_chunks - 1;
}
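To make the chunking behaviour concrete, a hypothetical allocation sequence with `max_chunk_size` = 4 GiB might proceed as follows (illustrative only):
```c
// alloc 3 GiB -> no chunks yet, chunk 0 created with a 4 GiB free block,
//                tensor placed at {chunk 0, offset 0}, 1 GiB left free
// alloc 2 GiB -> does not fit in chunk 0's remaining 1 GiB,
//                chunk 1 created, tensor placed at {chunk 1, offset 0}
// alloc 6 GiB -> exceeds max_chunk_size, chunk 2 created with a 6 GiB free block
// free  the 3 GiB tensor -> its range returns to chunk 0's free-block list
```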
#ifdef GGML_ALLOCATOR_DEBUG
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].tensor == NULL) {
alloc->allocated_tensors[i].tensor = tensor;
alloc->allocated_tensors[i].offset = offset;
alloc->allocated_tensors[i].addr = addr;
return;
}
}
GGML_ABORT("out of allocated_tensors");
}
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].offset == offset) {
if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
alloc->allocated_tensors[i].tensor = NULL;
return;
}
@@ -136,76 +201,94 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
}
#endif
static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
int best_fit_chunk = -1;
int best_fit_block = -1;
size_t max_avail = 0;
// find the best fitting free block besides the last block
int best_fit_block = -1;
size_t best_fit_size = SIZE_MAX;
for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
struct free_block * block = &alloc->free_blocks[i];
max_avail = MAX(max_avail, block->size);
if (block->size >= size && block->size <= best_fit_size) {
best_fit_block = i;
best_fit_size = block->size;
// find the best fitting free block besides the last block, within any chunk
for (int c = 0; c < alloc->n_chunks; ++c) {
struct tallocr_chunk * chunk = alloc->chunks[c];
size_t best_fit_size = SIZE_MAX;
for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
struct free_block * block = &chunk->free_blocks[i];
max_avail = MAX(max_avail, block->size);
if (block->size >= size && block->size <= best_fit_size) {
best_fit_chunk = c;
best_fit_block = i;
best_fit_size = block->size;
}
}
}
if (best_fit_block == -1) {
// the last block is our last resort
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
max_avail = MAX(max_avail, block->size);
if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1;
} else {
// this should never happen
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
GGML_ABORT("not enough space in the buffer");
}
}
struct free_block * block = &alloc->free_blocks[best_fit_block];
size_t offset = block->offset;
block->offset = offset + size;
block->size -= size;
if (block->size == 0) {
// remove block if empty
alloc->n_free_blocks--;
for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
}
}
AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
#ifdef GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, offset, tensor);
size_t cur_max = offset + size;
if (cur_max > alloc->max_size) {
// sort allocated_tensors by offset
for (int i = 0; i < 1024; i++) {
for (int j = i + 1; j < 1024; j++) {
if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
size_t tmp_offset = alloc->allocated_tensors[i].offset;
alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
alloc->allocated_tensors[j].tensor = tmp_tensor;
alloc->allocated_tensors[j].offset = tmp_offset;
// no suitable block found, try the last block (this will grow a chunk's size)
for (int c = 0; c < alloc->n_chunks; ++c) {
struct tallocr_chunk * chunk = alloc->chunks[c];
if (chunk->n_free_blocks > 0) {
struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
max_avail = MAX(max_avail, block->size);
if (block->size >= size) {
best_fit_chunk = c;
best_fit_block = chunk->n_free_blocks - 1;
break;
}
}
}
GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
}
if (best_fit_block == -1) {
// none of the existing chunks have enough space left
best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size);
best_fit_block = 0;
}
if (best_fit_chunk == -1) {
// since the last chunk always has virtually endless memory, this should never happen
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
GGML_ABORT("graph allocation: failed to reserve memory");
}
struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
struct free_block * block = &chunk->free_blocks[best_fit_block];
struct buffer_address addr = {.chunk = best_fit_chunk, .offset = block->offset };
block->offset += size;
block->size -= size;
if (block->size == 0) {
// remove block if empty
ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
}
AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);
#ifdef GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, addr, tensor);
size_t cur_max = addr.offset + size;
if (cur_max > alloc->max_size[addr.chunk]) {
// sort allocated_tensors by chunk/offset
for (int i = 0; i < 1024; i++) {
for (int j = i + 1; j < 1024; j++) {
if (ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
alloc->allocated_tensors[j].tensor = tmp_tensor;
alloc->allocated_tensors[j].addr = tmp_addr;
}
}
}
GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].tensor) {
GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
alloc->allocated_tensors[i].offset,
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
alloc->allocated_tensors[i].addr.chunk,
alloc->allocated_tensors[i].addr.offset,
alloc->allocated_tensors[i].addr.offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
}
}
@@ -213,78 +296,69 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
}
#endif
alloc->max_size = MAX(alloc->max_size, offset + size);
chunk->max_size = MAX(chunk->max_size, addr.offset + size);
return offset;
return addr;
GGML_UNUSED(tensor);
}
// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
__func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);
#ifdef GGML_ALLOCATOR_DEBUG
remove_allocated_tensor(alloc, offset, tensor);
remove_allocated_tensor(alloc, addr, tensor);
#endif
struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
// see if we can merge with an existing block
for (int i = 0; i < alloc->n_free_blocks; i++) {
struct free_block * block = &alloc->free_blocks[i];
for (int i = 0; i < chunk->n_free_blocks; i++) {
struct free_block * block = &chunk->free_blocks[i];
// check if ptr is at the end of the block
if (block->offset + block->size == offset) {
if (block->offset + block->size == addr.offset) {
block->size += size;
// check if we can merge with the next block
if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
block->size += alloc->free_blocks[i+1].size;
alloc->n_free_blocks--;
for (int j = i+1; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
if (i < chunk->n_free_blocks - 1) {
struct free_block * next = &chunk->free_blocks[i+1];
if (block->offset + block->size == next->offset) {
block->size += next->size;
ggml_dyn_tallocr_remove_block(chunk, i+1);
}
}
return;
}
// check if ptr is at the beginning of the block
if (offset + size == block->offset) {
block->offset = offset;
if (addr.offset + size == block->offset) {
block->offset = addr.offset;
block->size += size;
// check if we can merge with the previous block
if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
alloc->free_blocks[i-1].size += block->size;
alloc->n_free_blocks--;
for (int j = i; j < alloc->n_free_blocks; j++) {
alloc->free_blocks[j] = alloc->free_blocks[j+1];
if (i > 0) {
struct free_block * prev = &chunk->free_blocks[i-1];
if (prev->offset + prev->size == block->offset) {
prev->size += block->size;
ggml_dyn_tallocr_remove_block(chunk, i);
}
}
return;
}
}
// otherwise, add a new block
GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
int insert_pos = 0;
while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
insert_pos++;
}
// shift all blocks from insert_pos onward to make room for the new block
for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
alloc->free_blocks[i] = alloc->free_blocks[i-1];
}
// insert the new block
alloc->free_blocks[insert_pos].offset = offset;
alloc->free_blocks[insert_pos].size = size;
alloc->n_free_blocks++;
ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);
GGML_UNUSED(tensor);
}
static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
alloc->n_free_blocks = 1;
alloc->free_blocks[0].offset = 0;
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
alloc->max_size = 0;
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
free(alloc->chunks[i]);
alloc->chunks[i] = NULL;
}
alloc->n_chunks = 0;
#ifdef GGML_ALLOCATOR_DEBUG
for (int i = 0; i < 1024; i++) {
@@ -293,14 +367,14 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
#endif
}
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
*alloc = (struct ggml_dyn_tallocr) {
/*.alignment = */ alignment,
/*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}},
/*.max_size = */ 0,
/*.alignment = */ alignment,
/*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
/*.chunks = */ {NULL},
/*.n_chunks = */ 0,
#ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ {{0}},
#endif
@@ -312,11 +386,79 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
}
static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
for (int i = 0; i < alloc->n_chunks; ++i) {
free(alloc->chunks[i]);
}
free(alloc);
}
static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
return alloc->max_size;
size_t max_size = 0;
for (int i = 0; i < alloc->n_chunks; i++) {
max_size += alloc->chunks[i]->max_size;
}
return max_size;
}
// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
struct vbuffer {
ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
};
static void ggml_vbuffer_free(struct vbuffer * buf) {
if (buf == NULL) {
return;
}
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) {
ggml_backend_buffer_free(buf->chunks[i]);
}
free(buf);
}
static int ggml_vbuffer_n_chunks(struct vbuffer * buf) {
int n = 0;
while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
return n;
}
static size_t ggml_vbuffer_size(struct vbuffer * buf) {
size_t size = 0;
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
size += ggml_backend_buffer_get_size(buf->chunks[i]);
}
return size;
}
static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
if (buf == NULL) {
return NULL;
}
for (int n = 0; n < talloc->n_chunks; n++) {
size_t chunk_size = talloc->chunks[n]->max_size;
buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size);
if (buf->chunks[n] == NULL) {
ggml_vbuffer_free(buf);
return NULL;
}
ggml_backend_buffer_set_usage(buf->chunks[n], usage);
}
return buf;
}
static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) {
void * base = ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
void * addr = (char *)base + buf_addr.offset;
ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
}
static void ggml_vbuffer_reset(struct vbuffer * buf) {
for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
ggml_backend_buffer_reset(buf->chunks[i]);
}
}
@@ -328,13 +470,13 @@ struct hash_node {
int n_children;
int n_views;
int buffer_id;
size_t offset; // offset within the buffer
struct buffer_address addr;
bool allocated;
};
struct tensor_alloc {
int buffer_id;
size_t offset;
struct buffer_address addr;
size_t size_max; // 0 = pre-allocated, unused, or view
};
@@ -349,7 +491,7 @@ struct node_alloc {
struct ggml_gallocr {
ggml_backend_buffer_type_t * bufts; // [n_buffers]
ggml_backend_buffer_t * buffers; // [n_buffers]
struct vbuffer ** buffers; // [n_buffers]
struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
int n_buffers;
@@ -370,7 +512,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
GGML_ASSERT(galloc->bufts != NULL);
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
GGML_ASSERT(galloc->buffers != NULL);
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -390,7 +532,8 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
if (galloc->buf_tallocs[i] == NULL) {
size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
size_t max_size = ggml_backend_buft_get_max_size(bufts[i]);
galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment, max_size);
}
}
galloc->n_buffers = n_bufs;
@@ -418,7 +561,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
}
}
if (!freed) {
ggml_backend_buffer_free(galloc->buffers[i]);
ggml_vbuffer_free(galloc->buffers[i]);
}
}
if (galloc->buf_tallocs != NULL) {
@@ -467,7 +610,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
hn->allocated = true;
assert(hn->offset == 0);
assert(hn->addr.offset == 0);
// try to reuse a parent's buffer (inplace)
if (ggml_op_can_inplace(node->op)) {
@@ -501,9 +644,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
assert(view_src_hn->offset == p_hn->offset);
assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
hn->buffer_id = p_hn->buffer_id;
hn->offset = p_hn->offset;
hn->addr = p_hn->addr;
p_hn->allocated = false; // avoid freeing the parent
view_src_hn->allocated = false;
return;
@@ -511,7 +654,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
} else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
hn->buffer_id = p_hn->buffer_id;
hn->offset = p_hn->offset;
hn->addr = p_hn->addr;
p_hn->allocated = false; // avoid freeing the parent
return;
}
@@ -522,9 +665,8 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
hn->buffer_id = buffer_id;
hn->offset = offset;
hn->addr = ggml_dyn_tallocr_alloc(alloc, size, node);
}
}
@@ -536,12 +678,11 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
}
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
size_t offset = hn->offset;
int buffer_id = hn->buffer_id;
struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
hn->allocated = false;
}
@@ -692,24 +833,24 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
struct node_alloc * node_alloc = &galloc->node_allocs[i];
if (node->view_src || node->data) {
node_alloc->dst.buffer_id = -1;
node_alloc->dst.offset = SIZE_MAX;
node_alloc->dst.addr = GGML_BUFFER_ADDRESS_INVALID;
node_alloc->dst.size_max = 0;
} else {
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
node_alloc->dst.buffer_id = hn->buffer_id;
node_alloc->dst.offset = hn->offset;
node_alloc->dst.addr = hn->addr;
node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (!src || src->view_src || src->data) {
node_alloc->src[j].buffer_id = -1;
node_alloc->src[j].offset = SIZE_MAX;
node_alloc->src[j].addr = GGML_BUFFER_ADDRESS_INVALID;
node_alloc->src[j].size_max = 0;
} else {
struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
node_alloc->src[j].buffer_id = hn->buffer_id;
node_alloc->src[j].offset = hn->offset;
node_alloc->src[j].addr = hn->addr;
node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
}
}
@@ -725,11 +866,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
if (leaf->view_src || leaf->data) {
galloc->leaf_allocs[i].leaf.buffer_id = -1;
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
galloc->leaf_allocs[i].leaf.addr = GGML_BUFFER_ADDRESS_INVALID;
galloc->leaf_allocs[i].leaf.size_max = 0;
} else {
galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
galloc->leaf_allocs[i].leaf.offset = hn->offset;
galloc->leaf_allocs[i].leaf.addr = hn->addr;
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
}
}
@@ -744,7 +885,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
}
}
size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
@@ -753,13 +894,12 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
ggml_vbuffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
if (galloc->buffers[i] == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
}
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
}
}
@@ -772,11 +912,11 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
int buffer_id = tensor_alloc->buffer_id;
assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
assert(tensor->data || tensor->view_src || ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
if (tensor->view_src != NULL) {
if (tensor->buffer == NULL) {
assert(tensor_alloc->offset == SIZE_MAX);
assert(tensor_alloc->addr.offset == SIZE_MAX);
if (tensor->view_src->buffer == NULL) {
// this tensor was allocated without ggml-backend
return;
@@ -785,11 +925,9 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
}
} else {
if (tensor->data == NULL) {
assert(tensor_alloc->offset != SIZE_MAX);
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
void * addr = (char *)base + tensor_alloc->offset;
ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
assert(tensor_alloc->addr.offset != SIZE_MAX);
assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
} else {
if (tensor->buffer == NULL) {
// this tensor was allocated without ggml-backend
@@ -874,7 +1012,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
// reset buffers
for (int i = 0; i < galloc->n_buffers; i++) {
if (galloc->buffers[i] != NULL) {
ggml_backend_buffer_reset(galloc->buffers[i]);
ggml_vbuffer_reset(galloc->buffers[i]);
}
}
@@ -917,7 +1055,7 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
}
}
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
return ggml_vbuffer_size(galloc->buffers[buffer_id]);
}
// utils

View File

@@ -8,7 +8,7 @@
extern "C" {
#endif
#define GGML_BACKEND_API_VERSION 1
#define GGML_BACKEND_API_VERSION 2
//
// Backend buffer type
@@ -114,6 +114,9 @@ extern "C" {
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
// wait for an event on a different stream
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
// (optional) sort/optimize the nodes in the graph
void (*graph_optimize) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
};
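A hedged sketch of what a backend-side implementation of the new hook could look like (the callback is optional, and leaving the graph untouched is a valid implementation):
```c
// Hypothetical backend hook: reorder or fuse nodes in-place before scheduling.
static void my_backend_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    (void) backend;
    (void) cgraph; // no-op: the scheduler will use the graph as-is
}
```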
struct ggml_backend {

View File

@@ -400,9 +400,8 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
ggml_backend_t ggml_backend_init_best(void) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
if (!dev) {
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
}
dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!dev) {
return nullptr;
}
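Callers are unaffected by the new fallback order; typical usage stays the same (sketch):
```c
// GPU is preferred, then integrated GPU, then CPU; NULL if none is available.
ggml_backend_t backend = ggml_backend_init_best();
if (backend == NULL) {
    // no usable backend registered
}
```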

Some files were not shown because too many files have changed in this diff.