Update

Hack test-bench
2026-02-12 14:03:20 +02:00 · 2024-04-22 10:42:32 +01:00 · 2024-04-12 11:26:36 +01:00
269 changed files with 51995 additions and 72964 deletions
--- a/.devops/main-intel.Dockerfile
+++ b/.devops/main-intel.Dockerfile
@@ -10,12 +10,14 @@ WORKDIR /app

 COPY . .

-RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+RUN mkdir build && \
+    cd build && \
+    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
        echo "LLAMA_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
    fi && \
-    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
-    cmake --build build --config Release --target main
+    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake --build . --config Release --target main

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

--- a/.devops/main-vulkan.Dockerfile
+++ b/.devops/main-vulkan.Dockerfile
@@ -14,8 +14,10 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DLLAMA_VULKAN=1 && \
-    cmake --build build --config Release --target main
+RUN mkdir build && \
+    cd build && \
+    cmake .. -DLLAMA_VULKAN=1 && \
+    cmake --build . --config Release --target main

 # Clean up
 WORKDIR /
--- a/.devops/server-intel.Dockerfile
+++ b/.devops/server-intel.Dockerfile
@@ -10,12 +10,14 @@ WORKDIR /app

 COPY . .

-RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+RUN mkdir build && \
+    cd build && \
+    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
        echo "LLAMA_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
    fi && \
-    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
-    cmake --build build --config Release --target server
+    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake --build . --config Release --target server

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

--- a/.devops/server-vulkan.Dockerfile
+++ b/.devops/server-vulkan.Dockerfile
@@ -18,8 +18,10 @@ RUN apt-get update && \
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
-    cmake --build build --config Release --target server
+RUN mkdir build && \
+    cd build && \
+    cmake .. -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
+    cmake --build . --config Release --target server

 # Clean up
 WORKDIR /
--- a/.flake8
+++ b/.flake8
@@ -1,17 +1,3 @@
 [flake8]
 max-line-length = 125
-ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
-exclude =
-    # Do not traverse examples
-    examples,
-    # Do not include package initializers
-    __init__.py,
-    # No need to traverse our git directory
-    .git,
-    # There's no value in checking cache directories
-    __pycache__,
-    # No need to include the build path
-    build,
-    # This contains builds that we don't want to check
-    dist  # This is generated with `python build .` for package releases
-# max-complexity = 10
+ignore = W503
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -32,7 +32,7 @@ on:
    -  cron: '04 2 * * *'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

 jobs:
@@ -52,19 +52,7 @@ jobs:
            ftype: q4_0
            pr_comment_enabled: "true"

-    if: |
-      inputs.gpu-series == 'Standard_NC4as_T4_v3'
-      || (
-        github.event_name == 'schedule'
-        && github.ref_name == 'master'
-        && github.repository_owner == 'ggerganov'
-      )
-      || github.event_name == 'pull_request_target'
-      || (
-        github.event_name == 'push'
-        && github.event.ref == 'refs/heads/master'
-        && github.repository_owner == 'ggerganov'
-      )
+    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
    steps:
      - name: Clone
        id: checkout
@@ -108,7 +96,9 @@ jobs:
        id: cmake_build
        run: |
          set -eux
-          cmake -B build \
+          mkdir build
+          cd build
+          cmake .. \
              -DLLAMA_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
@@ -119,7 +109,7 @@ jobs:
              -DLLAMA_FATAL_WARNINGS=OFF \
              -DLLAMA_ALL_WARNINGS=OFF \
              -DCMAKE_BUILD_TYPE=Release;
-          cmake --build build --config Release -j $(nproc) --target server
+          cmake --build . --config Release -j $(nproc) --target server

      - name: Download the dataset
        id: download_dataset
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -32,8 +32,6 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0

      - name: Dependencies
        id: depends
@@ -90,8 +88,6 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0

      - name: Dependencies
        id: depends
@@ -210,8 +206,6 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0

      - name: Dependencies
        id: depends
@@ -244,33 +238,6 @@ jobs:
          ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
          ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          cp LICENSE ./build/bin/
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
-        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
-          name: llama-bin-ubuntu-x64.zip
-
 #  ubuntu-latest-cmake-sanitizer:
 #    runs-on: ubuntu-latest
 #
@@ -340,36 +307,6 @@ jobs:
          cd build
          ctest -L main --verbose

-  ubuntu-latest-cmake-rpc:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake -DLLAMA_RPC=ON ..
-          cmake --build . --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose
-
  ubuntu-22-cmake-vulkan:
    runs-on: ubuntu-22.04

@@ -623,63 +560,6 @@ jobs:
        run: |
            make swift

-  windows-msys2:
-    runs-on: windows-latest
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - { sys: UCRT64,  env: ucrt-x86_64,  build: Release }
-          - { sys: CLANG64, env: clang-x86_64, build: Release }
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-
-      - name: Setup ${{ matrix.sys }}
-        uses: msys2/setup-msys2@v2
-        with:
-          update: true
-          msystem: ${{matrix.sys}}
-          install: >-
-            base-devel
-            mingw-w64-${{matrix.env}}-toolchain
-            mingw-w64-${{matrix.env}}-cmake
-            mingw-w64-${{matrix.env}}-openblas
-
-      - name: Build using make
-        shell: msys2 {0}
-        run: |
-            make -j $(nproc)
-
-      - name: Clean after building using make
-        shell: msys2 {0}
-        run: |
-            make clean
-
-      - name: Build using make w/ OpenBLAS
-        shell: msys2 {0}
-        run: |
-            make LLAMA_OPENBLAS=1 -j $(nproc)
-
-      - name: Build using CMake
-        shell: msys2 {0}
-        run: |
-            cmake -B build
-            cmake --build build --config ${{ matrix.build }} -j $(nproc)
-
-      - name: Clean after building using CMake
-        shell: msys2 {0}
-        run: |
-            rm -rf build
-
-      - name: Build using CMake w/ OpenBLAS
-        shell: msys2 {0}
-        run: |
-            cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
-            cmake --build build --config ${{ matrix.build }} -j $(nproc)
-
  windows-latest-cmake:
    runs-on: windows-latest

@@ -693,28 +573,24 @@ jobs:
    strategy:
      matrix:
        include:
-          - build: 'rpc-x64'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'noavx-x64'
+          - build: 'noavx'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx2-x64'
+          - build: 'avx2'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx-x64'
+          - build: 'avx'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx512-x64'
+          - build: 'avx512'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'clblast-x64'
+          - build: 'clblast'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
-          - build: 'openblas-x64'
+          - build: 'openblas'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-          - build: 'kompute-x64'
+          - build: 'kompute'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'vulkan-x64'
+          - build: 'vulkan'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'llvm-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'msvc-arm64'
-            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'arm64'
+            defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'

    steps:
      - name: Clone
@@ -725,13 +601,13 @@ jobs:

      - name: Clone Kompute submodule
        id: clone_kompute
-        if: ${{ matrix.build == 'kompute-x64' }}
+        if: ${{ matrix.build == 'kompute' }}
        run: |
          git submodule update --init kompute

      - name: Download OpenCL SDK
        id: get_opencl
-        if: ${{ matrix.build == 'clblast-x64' }}
+        if: ${{ matrix.build == 'clblast' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
          mkdir $env:RUNNER_TEMP/opencl
@@ -739,7 +615,7 @@ jobs:

      - name: Download CLBlast
        id: get_clblast
-        if: ${{ matrix.build == 'clblast-x64' }}
+        if: ${{ matrix.build == 'clblast' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
          curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
@@ -752,7 +628,7 @@ jobs:

      - name: Download OpenBLAS
        id: get_openblas
-        if: ${{ matrix.build == 'openblas-x64' }}
+        if: ${{ matrix.build == 'openblas' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
@@ -765,41 +641,38 @@ jobs:

      - name: Install Vulkan SDK
        id: get_vulkan
-        if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
+        if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
        run: |
          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"

-      - name: Install Ninja
-        id: install_ninja
-        run: |
-          choco install ninja
-
      - name: Build
        id: cmake_build
        run: |
-          cmake -S . -B build ${{ matrix.defines }}
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          mkdir build
+          cd build
+          cmake .. ${{ matrix.defines }}
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

      - name: Add clblast.dll
        id: add_clblast_dll
-        if: ${{ matrix.build == 'clblast-x64' }}
+        if: ${{ matrix.build == 'clblast' }}
        run: |
          cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
          cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt

      - name: Add libopenblas.dll
        id: add_libopenblas_dll
-        if: ${{ matrix.build == 'openblas-x64' }}
+        if: ${{ matrix.build == 'openblas' }}
        run: |
          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt

      - name: Check AVX512F support
        id: check_avx512f
-        if: ${{ matrix.build == 'avx512-x64' }}
+        if: ${{ matrix.build == 'avx512' }}
        continue-on-error: true
        run: |
          cd build
@@ -813,14 +686,14 @@ jobs:
      - name: Test
        id: cmake_test
        # not all machines have native AVX-512
-        if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+        if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
          ctest -L main -C Release --verbose --timeout 900

      - name: Test (Intel SDE)
        id: cmake_test_sde
-        if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
+        if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
        run: |
          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
          # for some weird reason windows tar doesn't like sde tar.xz
@@ -848,14 +721,14 @@ jobs:
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
-          name: llama-bin-win-${{ matrix.build }}.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
+          name: llama-bin-win-${{ matrix.build }}-x64.zip

  windows-latest-cmake-cuda:
    runs-on: windows-latest
@@ -935,9 +808,9 @@ jobs:
        shell: bash

    env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
-      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
+
    steps:
      - name: Clone
        id: checkout
@@ -969,17 +842,6 @@ jobs:
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
-          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
-
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
-          echo "cp oneAPI running time dll files to ./build/bin done"
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*

      - name: Upload artifacts
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -12,7 +12,7 @@ jobs:
    steps:
      - uses: actions/stale@v5
        with:
-          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
+          exempt-issue-labels: "refactor,help wanted,good first issue,research"
          days-before-issue-stale: 30
          days-before-issue-close: 14
          stale-issue-label: "stale"
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -91,12 +91,6 @@ jobs:
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi

-      - name: Downcase github.repository_owner
-        run: |
-          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
-        env:
-          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
-
      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
@@ -104,7 +98,7 @@ jobs:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
@@ -113,5 +107,5 @@ jobs:
          context: .
          push: ${{ github.event_name == 'push' }}
          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
          file: ${{ matrix.config.dockerfile }}
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -20,4 +20,5 @@ jobs:
      - name: flake8 Lint
        uses: py-actions/flake8@v2
        with:
-            plugins: "flake8-no-print"
+            ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
+            exclude: "examples/*,examples/*/**,*/**/__init__.py"
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -23,7 +23,7 @@ on:
    -  cron: '2 4 * * *'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

 jobs:
@@ -41,16 +41,23 @@ jobs:
            sanitizer: ""
      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken

+    container:
+      image: ubuntu:latest
+      ports:
+        - 8888
+      options: --cpus 4
+
    steps:
      - name: Dependencies
        id: depends
        run: |
-          sudo apt-get update
-          sudo apt-get -y install \
+          apt-get update
+          apt-get -y install \
            build-essential \
            xxd \
            git \
            cmake \
+            python3-pip \
            curl \
            wget \
            language-pack-en \
@@ -63,17 +70,6 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

-      - name: Python setup
-        id: setup_python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r examples/server/tests/requirements.txt
-
      - name: Verify server deps
        id: verify_server_deps
        run: |
@@ -94,14 +90,20 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          cmake -B build \
+          mkdir build
+          cd build
+          cmake .. \
              -DLLAMA_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
+          cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server

+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r examples/server/tests/requirements.txt

      - name: Tests
        id: server_integration_tests
@@ -127,7 +129,6 @@ jobs:
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
-          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: libCURL
        id: get_libcurl
@@ -141,8 +142,10 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
-          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server

      - name: Python setup
        id: setup_python
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,6 @@
 *.a
 *.so
 *.gguf
-*.gguf.json
 *.bin
 *.exe
 *.dll
@@ -35,7 +34,6 @@ lcov-report/
 gcovr-report/

 build*
-!build.zig
 cmake-build-*
 out/
 tmp/
@@ -102,25 +100,7 @@ qnt-*.txt
 perf-*.txt

 examples/jeopardy/results.txt
-examples/server/*.html.hpp
-examples/server/*.js.hpp
-examples/server/*.mjs.hpp

 poetry.lock
 poetry.toml
 nppBackup
-
-# Test binaries
-/tests/test-grammar-parser
-/tests/test-llama-grammar
-/tests/test-double-float
-/tests/test-grad0
-/tests/test-opt
-/tests/test-quantize-fns
-/tests/test-quantize-perf
-/tests/test-sampling
-/tests/test-tokenizer-0
-/tests/test-tokenizer-1-spm
-/tests/test-tokenizer-1-bpe
-/tests/test-rope
-/tests/test-backend-ops
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,14 +3,13 @@
 exclude: prompts/.*.txt
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.6.0
+  rev: v3.2.0
  hooks:
  - id: trailing-whitespace
  - id: end-of-file-fixer
  - id: check-yaml
  - id: check-added-large-files
 - repo: https://github.com/PyCQA/flake8
-  rev: 7.0.0
+  rev: 6.0.0
  hooks:
  -   id: flake8
-      additional_dependencies: [flake8-no-print]
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,8 +43,6 @@ else()
    set(LLAMA_METAL_DEFAULT OFF)
 endif()

-set(LLAMA_LLAMAFILE_DEFAULT ON)
-
 # general
 option(BUILD_SHARED_LIBS                "build shared libraries"                                OFF)
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
@@ -90,7 +88,6 @@ endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
 option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
-option(LLAMA_LLAMAFILE                       "llama: use llamafile SGEMM"                       ${LLAMA_LLAMAFILE_DEFAULT})
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUDA                            "llama: use CUDA"                                  OFF)
 option(LLAMA_CUBLAS                          "llama: use CUDA (deprecated, use LLAMA_CUDA)"     OFF)
@@ -103,8 +100,6 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                             "llama: max. batch size for using peer access")
 option(LLAMA_CUDA_NO_PEER_COPY               "llama: do not use peer to peer copies"            OFF)
-option(LLAMA_CUDA_NO_VMM                     "llama: do not try to use CUDA VMM"                OFF)
-
 option(LLAMA_CURL                            "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
@@ -123,7 +118,6 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING          "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
-option(LLAMA_RPC                             "llama: use RPC"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
@@ -292,12 +286,11 @@ if (LLAMA_METAL)
        ${METALKIT_FRAMEWORK}
        )
 endif()
-
 if (LLAMA_BLAS)
    if (LLAMA_STATIC)
        set(BLA_STATIC ON)
    endif()
-    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
+    if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
        set(BLA_SIZEOF_INTEGER 8)
    endif()

@@ -375,13 +368,6 @@ if (LLAMA_BLAS)
    endif()
 endif()

-if (LLAMA_LLAMAFILE)
-    add_compile_definitions(GGML_USE_LLAMAFILE)
-
-    set(GGML_HEADERS_LLAMAFILE sgemm.h)
-    set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
-endif()
-
 if (LLAMA_QKK_64)
    add_compile_definitions(GGML_QKK_64)
 endif()
@@ -406,16 +392,12 @@ if (LLAMA_CUDA)
        list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")

        add_compile_definitions(GGML_USE_CUDA)
-        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
        if (LLAMA_CUDA_FORCE_DMMV)
            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
        endif()
        if (LLAMA_CUDA_FORCE_MMQ)
            add_compile_definitions(GGML_CUDA_FORCE_MMQ)
        endif()
-        if (LLAMA_CUDA_NO_VMM)
-            add_compile_definitions(GGML_CUDA_NO_VMM)
-        endif()
        add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
        add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
        if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -432,7 +414,7 @@ if (LLAMA_CUDA)

        if (LLAMA_STATIC)
            if (WIN32)
-                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
+                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
            else ()
                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -441,11 +423,7 @@ if (LLAMA_CUDA)
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
        endif()

-        if (LLAMA_CUDA_NO_VMM)
-            # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
-        else()
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
-        endif()
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)

    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        # 52 == lowest CUDA 12 standard
@@ -495,17 +473,6 @@ if (LLAMA_MPI)
    endif()
 endif()

-if (LLAMA_RPC)
-    add_compile_definitions(GGML_USE_RPC)
-
-    if (WIN32)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
-    endif()
-
-    set(GGML_HEADERS_RPC ggml-rpc.h)
-    set(GGML_SOURCES_RPC ggml-rpc.cpp)
-endif()
-
 if (LLAMA_CLBLAST)
    find_package(CLBlast)
    if (CLBlast_FOUND)
@@ -1007,11 +974,6 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
        if (GGML_COMPILER_SUPPORT_DOTPROD)
            add_compile_definitions(__ARM_FEATURE_DOTPROD)
        endif ()
-        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
-        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
-            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
-        endif ()
-
        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@@ -1189,17 +1151,15 @@ add_library(ggml OBJECT
            ggml-backend.h
            ggml-quants.c
            ggml-quants.h
-            ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
-            ${GGML_SOURCES_OPENCL}    ${GGML_HEADERS_OPENCL}
-            ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI}       ${GGML_HEADERS_MPI}
-            ${GGML_SOURCES_RPC}       ${GGML_HEADERS_RPC}
-            ${GGML_SOURCES_EXTRA}     ${GGML_HEADERS_EXTRA}
-            ${GGML_SOURCES_SYCL}      ${GGML_HEADERS_SYCL}
-            ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
-            ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
-            ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
-            ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
+            ${GGML_SOURCES_CUDA}    ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_OPENCL}  ${GGML_HEADERS_OPENCL}
+            ${GGML_SOURCES_METAL}   ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_MPI}     ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_EXTRA}   ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_SYCL}    ${GGML_HEADERS_SYCL}
+            ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
+            ${GGML_SOURCES_VULKAN}  ${GGML_HEADERS_VULKAN}
+            ${GGML_SOURCES_ROCM}    ${GGML_HEADERS_ROCM}
            )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
@@ -1299,6 +1259,17 @@ install(
        WORLD_READ
        WORLD_EXECUTE
    DESTINATION ${CMAKE_INSTALL_BINDIR})
+install(
+    FILES convert-lora-to-ggml.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
 if (LLAMA_METAL)
    install(
        FILES ggml-metal.metal
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -1,45 +0,0 @@
-{
-  "version": 4,
-  "configurePresets": [
-    {
-        "name":  "base",
-        "hidden": true,
-        "generator":   "Ninja",
-        "binaryDir":   "${sourceDir}/build-${presetName}",
-        "cacheVariables": {
-            "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
-            "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
-        }
-    },
-
-    { "name": "debug",   "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
-    { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
-    { "name": "static",  "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
-
-    {
-        "name": "arm64-windows-msvc", "hidden": true,
-        "architecture": { "value": "arm64",       "strategy": "external" },
-        "toolset":      { "value": "host=x86_64", "strategy": "external" },
-        "cacheVariables": {
-            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
-        }
-    },
-
-    {
-        "name": "arm64-windows-llvm", "hidden": true,
-        "architecture": { "value": "arm64",       "strategy": "external" },
-        "toolset":      { "value": "host=x86_64", "strategy": "external" },
-        "cacheVariables": {
-            "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
-        }
-    },
-
-    { "name": "arm64-windows-llvm-debug"  , "inherits": [ "base", "arm64-windows-llvm",  "debug"   ] },
-    { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm",  "release" ] },
-    { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm",  "release", "static" ] },
-
-    { "name": "arm64-windows-msvc-debug"  , "inherits": [ "base", "arm64-windows-msvc",  "debug"   ] },
-    { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc",  "release" ] },
-    { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc",  "release", "static" ] }
-  ]
-}
--- a/87
+++ b/87
@@ -6,23 +6,11 @@ BUILD_TARGETS = \

 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-autorelease \
-	tests/test-backend-ops \
-	tests/test-double-float \
-	tests/test-grad0 \
-	tests/test-grammar-integration \
-	tests/test-grammar-parser \
-	tests/test-json-schema-to-grammar \
-	tests/test-llama-grammar \
-	tests/test-model-load-cancel \
-	tests/test-opt \
-	tests/test-quantize-fns \
-	tests/test-quantize-perf \
-	tests/test-rope \
-	tests/test-sampling \
-	tests/test-tokenizer-0 \
-	tests/test-tokenizer-1-bpe \
-	tests/test-tokenizer-1-spm
+	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
+	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
+	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease                                 \
+	tests/test-json-schema-to-grammar tests/test-grammar-integration

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -39,17 +27,6 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif

-# In GNU make default CXX is g++ instead of c++.  Let's fix that so that users
-# of non-gcc compilers don't have to provide g++ alias or wrapper.
-DEFCC  := cc
-DEFCXX := c++
-ifeq ($(origin CC),default)
-CC  := $(DEFCC)
-endif
-ifeq ($(origin CXX),default)
-CXX := $(DEFCXX)
-endif
-
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@@ -72,16 +49,11 @@ default: $(BUILD_TARGETS)
 test: $(TEST_TARGETS)
 	@failures=0; \
 	for test_target in $(TEST_TARGETS); do \
-		if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
-			./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
+		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
+		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
 			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
-			./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
 			continue; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
 			continue; \
@@ -412,11 +384,6 @@ ifdef LLAMA_OPENBLAS
 	MK_LDFLAGS  += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS

-ifndef LLAMA_NO_LLAMAFILE
-	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
-	OBJS        += sgemm.o
-endif
-
 ifdef LLAMA_BLIS
 	MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
 	MK_LDFLAGS  += -lblis -L/usr/local/lib
@@ -433,7 +400,7 @@ ifdef LLAMA_CUDA
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
+	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
 	OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
@@ -513,9 +480,11 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC_COMPILE)
+
 endif # LLAMA_CUDA

 ifdef LLAMA_CLBLAST
+
 	MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
 	MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -634,11 +603,6 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI

-ifndef LLAMA_NO_LLAMAFILE
-sgemm.o: sgemm.cpp sgemm.h ggml.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif
-
 GF_CC := $(CC)
 include scripts/get-flags.mk

@@ -723,8 +687,8 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o

 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -792,11 +756,11 @@ batched: examples/batched/batched.cpp                         ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-batched-bench: examples/batched-bench/batched-bench.cpp       build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+batched-bench: examples/batched-bench/batched-bench.cpp       build-info.o ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-quantize: examples/quantize/quantize.cpp                      ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+quantize: examples/quantize/quantize.cpp                      build-info.o ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -824,19 +788,10 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

-# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
-examples/server/%.hpp: examples/server/public/% Makefile
-	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
-		echo "unsigned char $${NAME}[] = {" && \
-		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
-		echo "};" && \
-		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
-	) > $@
-
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -999,7 +954,11 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -1007,7 +966,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

--- a/Package.swift
+++ b/Package.swift
@@ -2,45 +2,6 @@

 import PackageDescription

-var sources = [
-    "ggml.c",
-    "sgemm.cpp",
-    "llama.cpp",
-    "unicode.cpp",
-    "unicode-data.cpp",
-    "ggml-alloc.c",
-    "ggml-backend.c",
-    "ggml-quants.c",
-]
-
-var resources: [Resource] = []
-var linkerSettings: [LinkerSetting] = []
-var cSettings: [CSetting] =  [
-    .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-    .unsafeFlags(["-fno-objc-arc"]),
-    // NOTE: NEW_LAPACK will required iOS version 16.4+
-    // We should consider add this in the future when we drop support for iOS 14
-    // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
-    // .define("ACCELERATE_NEW_LAPACK"),
-    // .define("ACCELERATE_LAPACK_ILP64")
-]
-
-#if canImport(Darwin)
-sources.append("ggml-metal.m")
-resources.append(.process("ggml-metal.metal"))
-linkerSettings.append(.linkedFramework("Accelerate"))
-cSettings.append(
-    contentsOf: [
-        .define("GGML_USE_ACCELERATE"),
-        .define("GGML_USE_METAL")
-    ]
-)
-#endif
-
-#if os(Linux)
-    cSettings.append(.define("_GNU_SOURCE"))
-#endif
-
 let package = Package(
    name: "llama",
    platforms: [
@@ -67,11 +28,34 @@ let package = Package(
               "ggml-cuda.h",
               "Makefile"
            ],
-            sources: sources,
-            resources: resources,
+            sources: [
+                "ggml.c",
+                "llama.cpp",
+                "unicode.cpp",
+                "unicode-data.cpp",
+                "ggml-alloc.c",
+                "ggml-backend.c",
+                "ggml-quants.c",
+                "ggml-metal.m",
+            ],
+            resources: [
+                .process("ggml-metal.metal")
+            ],
            publicHeadersPath: "spm-headers",
-            cSettings: cSettings,
-            linkerSettings: linkerSettings
+            cSettings: [
+                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
+                .define("GGML_USE_ACCELERATE"),
+                .unsafeFlags(["-fno-objc-arc"]),
+                .define("GGML_USE_METAL"),
+                // NOTE: NEW_LAPACK will required iOS version 16.4+
+                // We should consider add this in the future when we drop support for iOS 14
+                // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
+                // .define("ACCELERATE_NEW_LAPACK"),
+                // .define("ACCELERATE_LAPACK_ILP64")
+            ],
+            linkerSettings: [
+                .linkedFramework("Accelerate")
+            ]
        )
    ],
    cxxLanguageStandard: .cxx11
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -68,7 +68,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,

 | Intel GPU                     | Status  | Verified Model                        |
 |-------------------------------|---------|---------------------------------------|
-| Intel Data Center Max Series  | Support | Max 1550, 1100                        |
+| Intel Data Center Max Series  | Support | Max 1550                              |
 | Intel Data Center Flex Series | Support | Flex 170                              |
 | Intel Arc Series              | Support | Arc 770, 730M                         |
 | Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
@@ -84,7 +84,8 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 - **Execution Unit (EU)**
  - If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.

-### Other Vendor GPU
+### Nvidia GPU
+The BLAS acceleration on Nvidia GPU through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)

 **Verified devices**

@@ -93,9 +94,14 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 | Ampere Series            | Support | A100, A4000    |
 | Ampere Series *(Mobile)* | Support | RTX 40 Series  |

+*Notes:*
+  - Support for Nvidia targets through oneAPI is currently limited to Linux platforms.
+
+  - Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs.
+
+
 ## Docker
 The docker build option is currently limited to *intel GPU* targets.
-
 ### Build image
 ```sh
 # Using FP16
@@ -162,10 +168,29 @@ Platform #0: Intel(R) OpenCL HD Graphics
 - **Nvidia GPU**

 In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
+Installation can be verified by running the following:
+```sh
+nvidia-smi
+```
+Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*:
+```
+---------------------------------------------------------------------------------------+
+| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
+|-----------------------------------------+----------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
+|                                         |                      |               MIG M. |
+|=========================================+======================+======================|
+|   0  NVIDIA A100-PCIE-40GB          On  | 00000000:8D:00.0 Off |                    0 |
+| N/A   36C    P0              57W / 250W |      4MiB / 40960MiB |      0%      Default |
+|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
+```
+

 2. **Install Intel® oneAPI Base toolkit**

- **For Intel GPU**
+- **Base installation**

 The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.

@@ -177,16 +202,17 @@ Upon a successful installation, SYCL is enabled for the available intel devices,

 - **Adding support to Nvidia GPUs**

-**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
+**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.


-**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
+**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.

 ```sh
 git clone https://github.com/oneapi-src/oneMKL
 cd oneMKL
-cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
-cmake --build buildWithCublas --config Release
+mkdir -p buildWithCublas && cd buildWithCublas
+cmake ../ -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
+make
 ```


@@ -211,7 +237,7 @@ When targeting an intel GPU, the user should expect one or more level-zero devic

 - **Nvidia GPU**

-Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
+Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
 ```
 [opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.12.0.12_195853.xmain-hotfix]
 [opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
@@ -226,15 +252,14 @@ Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA devic
 source /opt/intel/oneapi/setvars.sh

 # Build LLAMA with MKL BLAS acceleration for intel GPU
+mkdir -p build && cd build

-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+# Option 1: Use FP16 for better performance in long-prompt  inference
+cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+# Or without "--build", run "make" next

-# Option 2: Use FP16
-cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-
-# build all binary
-cmake --build build --config Release -j -v
+# Option 2: Use FP32 by default
+cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 ```

 #### Nvidia GPU
@@ -246,16 +271,13 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_
 export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR

 # Build LLAMA with Nvidia BLAS acceleration through SYCL
+mkdir -p build && cd build

-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# Option 2: Use FP16
-cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-
-# build all binary
-cmake --build build --config Release -j -v
+# Option 1: Use FP16 for better performance in long-prompt  inference
+cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON

+# Option 2: Use FP32 by default
+cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 ```

 ### III. Run the inference
@@ -335,6 +357,7 @@ Otherwise, you can run the script:

 *Notes:*

+- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue.
 - Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:

 ```sh
@@ -409,15 +432,13 @@ b. Download & install mingw-w64 make for Windows provided by w64devkit
 On the oneAPI command line window, step into the llama.cpp main directory and run the following:

 ```
+mkdir -p build
+cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
+cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

-# Option 2: Or FP16
-cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
-
-cmake --build build --config Release -j
+make
 ```

 Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
@@ -504,6 +525,7 @@ Otherwise, run the following wrapper script:

 Note:

+- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue.
 - Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:

 ```sh
@@ -535,6 +557,12 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 ## Known Issues

+- Hanging during startup
+
+  llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang.
+
+  - **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable.
+
 - `Split-mode:[row]` is not supported.

 ## Q&A
@@ -546,7 +574,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 - General compiler error:

-  - Remove **build** folder or try a clean-build.
+  - Remove build folder or try a clean-build.

 - I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.

--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

-[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

@@ -10,7 +10,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Recent API changes

- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
 - [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
 - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
 - [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
@@ -20,9 +19,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
+- **MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387**
 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
 - Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
 - Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
@@ -95,11 +92,9 @@ Typically finetunes of the base models below are supported as well.

 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
- [x] LLaMA 3 🦙🦙🦙
 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
+- [X] Falcon
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
@@ -122,12 +117,10 @@ Typically finetunes of the base models below are supported as well.
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
 - [x] [Mamba](https://github.com/state-spaces/mamba)
- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
 - [x] [Xverse](https://huggingface.co/models?search=xverse)
- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
+- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))

@@ -139,8 +132,6 @@ Typically finetunes of the base models below are supported as well.
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)

 **HTTP server**

@@ -176,7 +167,6 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
@@ -198,8 +188,6 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [MindMac](https://mindmac.app) (proprietary)
 - [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
-
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

 ---
@@ -310,8 +298,6 @@ In order to build llama.cpp you have three different options.
      make
      ```

-      **Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`
-
  - On Windows:

    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
@@ -326,26 +312,12 @@ In order to build llama.cpp you have three different options.
 - Using `CMake`:

    ```bash
-    cmake -B build
-    cmake --build build --config Release
+    mkdir build
+    cd build
+    cmake ..
+    cmake --build . --config Release
    ```

-    **Note**: for `Debug` builds, there are two cases:
-
-    - Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
-
-      ```bash
-      cmake -B build -DCMAKE_BUILD_TYPE=Debug
-      cmake --build build
-      ```
-
-    - Multi-config generators (`-G` param set to Visual Studio, XCode...):
-
-      ```bash
-      cmake -B build -G "Xcode"
-      cmake --build build --config Debug
-      ```
-
 - Using `Zig` (version 0.11 or later):

    Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
@@ -457,8 +429,10 @@ Building the program with BLAS support may lead to some performance improvements
  - Using `CMake` on Linux:

      ```bash
-      cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
-      cmake --build build --config Release
+      mkdir build
+      cd build
+      cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+      cmake --build . --config Release
      ```

 - #### BLIS
@@ -478,9 +452,11 @@ Building the program with BLAS support may lead to some performance improvements
  - Using manual oneAPI installation:
    By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
      ```bash
+      mkdir build
+      cd build
      source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-basekit docker image, only required for manual installation
-      cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
-      cmake --build build --config Release
+      cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
+      cmake --build . --config Release
      ```

  - Using oneAPI docker image:
@@ -501,8 +477,10 @@ Building the program with BLAS support may lead to some performance improvements
  - Using `CMake`:

    ```bash
-    cmake -B build -DLLAMA_CUDA=ON
-    cmake --build build --config Release
+    mkdir build
+    cd build
+    cmake .. -DLLAMA_CUDA=ON
+    cmake --build . --config Release
    ```

  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
@@ -529,10 +507,10 @@ Building the program with BLAS support may lead to some performance improvements
  - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
    ```bash
    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
-        cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
-        && cmake --build build --config Release -- -j 16
+        cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+        && cmake --build build -- -j 16
    ```
-    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
+    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
    However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).

  - Using `make` (example for target gfx1030, build with 16 CPU threads):
@@ -567,7 +545,7 @@ Building the program with BLAS support may lead to some performance improvements
  OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.

  You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
-    - For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
+    - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.

    - For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.

@@ -576,14 +554,15 @@ Building the program with BLAS support may lead to some performance improvements

        ```sh
        git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
-        cd OpenCL-SDK
-        cmake -B build -DBUILD_DOCS=OFF \
+        mkdir OpenCL-SDK/build
+        cd OpenCL-SDK/build
+        cmake .. -DBUILD_DOCS=OFF \
          -DBUILD_EXAMPLES=OFF \
          -DBUILD_TESTING=OFF \
          -DOPENCL_SDK_BUILD_SAMPLES=OFF \
          -DOPENCL_SDK_TEST_SAMPLES=OFF
-        cmake --build build
-        cmake --install build --prefix /some/path
+        cmake --build . --config Release
+        cmake --install . --prefix /some/path
        ```
      </details>

@@ -591,12 +570,6 @@ Building the program with BLAS support may lead to some performance improvements

  Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.

-  Linux packaging:
-  Fedora Linux:
-  ```bash
-  sudo dnf install clblast
-  ```
-
  Alternatively, they may be built from source.

  - <details>
@@ -605,23 +578,23 @@ Building the program with BLAS support may lead to some performance improvements
      ```cmd
      set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
      git clone https://github.com/CNugteren/CLBlast.git
-      cd CLBlast
-      cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
-      cmake --build build --config Release
-      cmake --install build --prefix C:/CLBlast
+      mkdir CLBlast\build
+      cd CLBlast\build
+      cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
+      cmake --build . --config Release
+      cmake --install . --prefix C:/CLBlast
      ```

-      (note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)
-
  - <details>
    <summary>Unix:</summary>

      ```sh
      git clone https://github.com/CNugteren/CLBlast.git
-      cd CLBlast
-      cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
-      cmake --build build --config Release
-      cmake --install build --prefix /some/path
+      mkdir CLBlast/build
+      cd CLBlast/build
+      cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
+      cmake --build . --config Release
+      cmake --install . --prefix /some/path
      ```

      Where `/some/path` is where the built library will be installed (default is `/usr/local`).
@@ -635,17 +608,21 @@ Building the program with BLAS support may lead to some performance improvements
    ```
  - CMake (Unix):
    ```sh
-    cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
-    cmake --build build --config Release
+    mkdir build
+    cd build
+    cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+    cmake --build . --config Release
    ```
  - CMake (Windows):
    ```cmd
    set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
    git clone https://github.com/ggerganov/llama.cpp
    cd llama.cpp
-    cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
-    cmake --build build --config Release
-    cmake --install build --prefix C:/LlamaCPP
+    mkdir build
+    cd build
+    cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
+    cmake --build . --config Release
+    cmake --install . --prefix C:/LlamaCPP
    ```

  ##### Running Llama with CLBlast
@@ -701,8 +678,10 @@ Building the program with BLAS support may lead to some performance improvements
  Then, build llama.cpp using the cmake command below:

  ```bash
-  cmake -B build -DLLAMA_VULKAN=1
-  cmake --build build --config Release
+  mkdir -p build
+  cd build
+  cmake .. -DLLAMA_VULKAN=1
+  cmake --build . --config Release
  # Test the output binary (with "-ngl 33" to offload all layers to GPU)
  ./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4

@@ -712,13 +691,8 @@ Building the program with BLAS support may lead to some performance improvements

 ### Prepare and Quantize

-> [!NOTE]
-> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
-
 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

-Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
-
 ```bash
 # obtain the official LLaMA model weights and place them in ./models
 ls ./models
@@ -940,25 +914,17 @@ If your issue is with model generation quality, then please at least scan the fo

 ### Android

-#### Build on Android using Termux
-[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
-```
-apt update && apt upgrade -y
-apt install git make cmake
-```
-
-It's recommended to move your model inside the `~/` directory for best performance:
-```
-cd storage/downloads
-mv model.gguf ~/
-```
-
-[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
-
 #### Building the Project using Android NDK
-Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
+You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
+
+First, install the essential packages for termux:
+```
+pkg install clang wget git cmake
+```
+Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
+
+You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux.

-Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
 ```
 $ mkdir build-android
 $ cd build-android
@@ -966,9 +932,7 @@ $ export NDK=<your_ndk_directory>
 $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
 $ make
 ```
-
-Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
-
+Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
 Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:

 (Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
@@ -990,10 +954,53 @@ $cd /data/data/com.termux/files/home/bin
 $./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
 ```

-Here's a demo of an interactive session running on Pixel 5 phone:
+Here is a demo of an interactive session running on Pixel 5 phone:

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

+#### Building the Project using Termux (F-Droid)
+Termux from F-Droid offers an alternative route to execute the project on an Android device. This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card.
+
+Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU.
+
+If you opt to utilize OpenBLAS, you'll need to install the corresponding package.
+```
+apt install libopenblas
+```
+
+Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages:
+```
+apt install ocl-icd opencl-headers opencl-clhpp clinfo
+```
+
+In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below:
+```
+cmake .
+make
+cp libclblast.so* $PREFIX/lib
+cp ./include/clblast.h ../llama.cpp
+```
+
+Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below:
+```
+cp /data/data/com.termux/files/usr/include/openblas/cblas.h .
+cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h .
+make LLAMA_CLBLAST=1 //(sometimes you need to run this command twice)
+```
+
+Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below:
+```
+GGML_OPENCL_PLATFORM=0
+GGML_OPENCL_DEVICE=0
+export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
+```
+
+(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
+
+For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
+
+Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.
+
 ### Docker

 #### Prerequisites
@@ -1099,9 +1106,7 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
 - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
-
-![matmul](media/matmul.png)
+- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`

 ### Docs

--- a/build.zig
+++ b/build.zig
@@ -112,7 +112,6 @@ pub fn build(b: *std.build.Builder) !void {
    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;

    const ggml = make.obj("ggml", "ggml.c");
-    const sgemm = make.obj("sgemm", "sgemm.cpp");
    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
@@ -129,44 +128,15 @@ pub fn build(b: *std.build.Builder) !void {
    const clip = make.obj("clip", "examples/llava/clip.cpp");
    const llava = make.obj("llava", "examples/llava/llava.cpp");

-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });

-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
-
-    const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
-    for (server_assets) |asset| {
-        const input_path = b.fmt("examples/server/public/{s}", .{asset});
-        const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
-
-        // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
-
-        const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
-        defer b.allocator.free(input);
-
-        var buf = std.ArrayList(u8).init(b.allocator);
-        defer buf.deinit();
-
-        for (input) |byte| {
-            try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
-        }
-
-        var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
-        defer b.allocator.free(name);
-        std.mem.replaceScalar(u8, name, '.', '_');
-
-        try std.fs.cwd().writeFile(output_path, b.fmt(
-            "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
-            .{ name, buf.items, name, input.len },
-        ));
-
-        std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
-    }
 }
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -153,54 +153,6 @@ function gg_sum_ctest_release {
    gg_printf '```\n'
 }

-# test_scripts_debug
-
-function gg_run_test_scripts_debug {
-    cd ${SRC}
-
-    set -e
-
-    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-
-    set +e
-}
-
-function gg_sum_test_scripts_debug {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs test scripts in debug mode\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
-    gg_printf '```\n'
-    gg_printf '\n'
-}
-
-# test_scripts_release
-
-function gg_run_test_scripts_release {
-    cd ${SRC}
-
-    set -e
-
-    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-
-    set +e
-}
-
-function gg_sum_test_scripts_release {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs test scripts in release mode\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
-    gg_printf '```\n'
-    gg_printf '\n'
-}
-
 function gg_get_model {
    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
@@ -335,8 +287,7 @@ function gg_run_open_llama_3b_v2 {

    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/save-load-state     --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -365,6 +316,47 @@ function gg_run_open_llama_3b_v2 {

    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

+    # lora
+    function compare_ppl {
+        qnt="$1"
+        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
+            return 20
+        fi
+
+        printf '  - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
+        return 0
+    }
+
+    path_lora="../models-mnt/open-llama/3B-v2/lora"
+    path_shakespeare="../models-mnt/shakespeare"
+
+    shakespeare="${path_shakespeare}/shakespeare.txt"
+    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
+
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
+    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
+
+    python3 ../convert-lora-to-ggml.py ${path_lora}
+
+    # f16
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # q8_0
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+    compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # q8_0 + f16 lora-base
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+    compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
    set +e
 }

@@ -375,6 +367,7 @@ function gg_sum_open_llama_3b_v2 {
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -387,6 +380,11 @@ function gg_sum_open_llama_3b_v2 {
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
+    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
+    gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
+    gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
+    gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }

 # open_llama_7b_v2
@@ -470,10 +468,7 @@ function gg_run_open_llama_7b_v2 {

    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -502,6 +497,48 @@ function gg_run_open_llama_7b_v2 {

    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

+    # lora
+    function compare_ppl {
+        qnt="$1"
+        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
+            return 20
+        fi
+
+        printf '  - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
+        return 0
+    }
+
+    path_lora="../models-mnt/open-llama/7B-v2/lora"
+    path_shakespeare="../models-mnt/shakespeare"
+
+    shakespeare="${path_shakespeare}/shakespeare.txt"
+    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
+
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
+    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
+
+    python3 ../convert-lora-to-ggml.py ${path_lora}
+
+    # f16
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # currently not supported by the CUDA backend
+    # q8_0
+    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+    #compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # q8_0 + f16 lora-base
+    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+    #compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
    set +e
 }

@@ -512,6 +549,7 @@ function gg_sum_open_llama_7b_v2 {
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -524,6 +562,11 @@ function gg_sum_open_llama_7b_v2 {
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
+    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
+    #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
+    #gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
+    #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }

 # bge-small
@@ -599,11 +642,6 @@ test $ret -eq 0 && gg_run ctest_release
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small

-    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
-        test $ret -eq 0 && gg_run test_scripts_debug
-        test $ret -eq 0 && gg_run test_scripts_release
-    fi
-
    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
        if [ -z ${GG_BUILD_CUDA} ]; then
            test $ret -eq 0 && gg_run open_llama_3b_v2
--- a/cmake/arm64-windows-llvm.cmake
+++ b/cmake/arm64-windows-llvm.cmake
@@ -1,16 +0,0 @@
-set( CMAKE_SYSTEM_NAME Windows )
-set( CMAKE_SYSTEM_PROCESSOR arm64 )
-
-set( target arm64-pc-windows-msvc )
-
-set( CMAKE_C_COMPILER    clang )
-set( CMAKE_CXX_COMPILER  clang++ )
-
-set( CMAKE_C_COMPILER_TARGET   ${target} )
-set( CMAKE_CXX_COMPILER_TARGET ${target} )
-
-set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
-set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
-
-set( CMAKE_C_FLAGS_INIT   "${arch_c_flags} ${warn_c_flags}" )
-set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
--- a/cmake/arm64-windows-msvc.cmake
+++ b/cmake/arm64-windows-msvc.cmake
@@ -1,6 +0,0 @@
-set( CMAKE_SYSTEM_NAME Windows )
-set( CMAKE_SYSTEM_PROCESSOR arm64 )
-
-set( target arm64-pc-windows-msvc )
-set( CMAKE_C_COMPILER_TARGET   ${target} )
-set( CMAKE_CXX_COMPILER_TARGET ${target} )
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -47,6 +47,9 @@ if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()

+set(TARGET json-schema-to-grammar)
+add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h)
+
 set(TARGET common)

 add_library(${TARGET} STATIC
@@ -60,7 +63,6 @@ add_library(${TARGET} STATIC
    grammar-parser.h
    grammar-parser.cpp
    json.hpp
-    json-schema-to-grammar.cpp
    train.h
    train.cpp
    ngram-cache.h
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,8 +1,4 @@
 #include "common.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
-#include "json-schema-to-grammar.h"
 #include "llama.h"

 #include <algorithm>
@@ -69,16 +65,15 @@
 #include <sys/syslimits.h>
 #endif
 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+#define LLAMA_CURL_MAX_HEADER_LENGTH 256
 #endif // LLAMA_USE_CURL

-using json = nlohmann::ordered_json;
-
 int32_t get_num_physical_cores() {
 #ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
-        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
+        std::ifstream thread_siblings("/sys/devices/system/cpu"
            + std::to_string(cpu) + "/topology/thread_siblings");
        if (!thread_siblings.is_open()) {
            break; // no more cpus
@@ -109,79 +104,6 @@ int32_t get_num_physical_cores() {
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }

-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
-#include <pthread.h>
-
-static void cpuid(unsigned leaf, unsigned subleaf,
-                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
-    __asm__("movq\t%%rbx,%%rsi\n\t"
-            "cpuid\n\t"
-            "xchgq\t%%rbx,%%rsi"
-            : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
-            : "0"(leaf), "2"(subleaf));
-}
-
-static int pin_cpu(int cpu) {
-    cpu_set_t mask;
-    CPU_ZERO(&mask);
-    CPU_SET(cpu, &mask);
-    return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
-}
-
-static bool is_hybrid_cpu(void) {
-    unsigned eax, ebx, ecx, edx;
-    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
-    return !!(edx & (1u << 15));
-}
-
-static bool is_running_on_efficiency_core(void) {
-    unsigned eax, ebx, ecx, edx;
-    cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
-    int intel_atom = 0x20;
-    int core_type = (eax & 0xff000000u) >> 24;
-    return core_type == intel_atom;
-}
-
-static int count_math_cpus(int cpu_count) {
-    int result = 0;
-    for (int cpu = 0; cpu < cpu_count; ++cpu) {
-        if (pin_cpu(cpu)) {
-            return -1;
-        }
-        if (is_running_on_efficiency_core()) {
-            continue; // efficiency cores harm lockstep threading
-        }
-        ++cpu; // hyperthreading isn't useful for linear algebra
-        ++result;
-    }
-    return result;
-}
-
-#endif // __x86_64__ && __linux__
-
-/**
- * Returns number of CPUs on system that are useful for math.
- */
-int get_math_cpu_count() {
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
-    int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
-    if (cpu_count < 1) {
-        return get_num_physical_cores();
-    }
-    if (is_hybrid_cpu()) {
-        cpu_set_t affinity;
-        if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
-            int result = count_math_cpus(cpu_count);
-            pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
-            if (result > 0) {
-                return result;
-            }
-        }
-    }
-#endif
-    return get_num_physical_cores();
-}
-
 void process_escapes(std::string & input) {
    std::size_t input_len = input.length();
    std::size_t output_idx = 0;
@@ -235,63 +157,15 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    return result;
 }

-bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
-    const char * sep = strchr(data, '=');
-    if (sep == nullptr || sep - data >= 128) {
-        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
-        return false;
-    }
-    llama_model_kv_override kvo;
-    std::strncpy(kvo.key, data, sep - data);
-    kvo.key[sep - data] = 0;
-    sep++;
-    if (strncmp(sep, "int:", 4) == 0) {
-        sep += 4;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-        kvo.val_i64 = std::atol(sep);
-    } else if (strncmp(sep, "float:", 6) == 0) {
-        sep += 6;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-        kvo.val_f64 = std::atof(sep);
-    } else if (strncmp(sep, "bool:", 5) == 0) {
-        sep += 5;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-        if (std::strcmp(sep, "true") == 0) {
-            kvo.val_bool = true;
-        } else if (std::strcmp(sep, "false") == 0) {
-            kvo.val_bool = false;
-        } else {
-            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
-            return false;
-        }
-    } else if (strncmp(sep, "str:", 4) == 0) {
-        sep += 4;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
-        if (strlen(sep) > 127) {
-            fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
-            return false;
-        }
-        strncpy(kvo.val_str, sep, 127);
-        kvo.val_str[127] = '\0';
-    } else {
-        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
-        return false;
-    }
-    overrides.emplace_back(std::move(kvo));
-    return true;
-}
-
 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
-    llama_sampling_params & sparams = params.sparams;
+    llama_sampling_params& sparams = params.sparams;

    if (arg == "-s" || arg == "--seed") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
-        // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context.
        params.seed = std::stoul(argv[i]);
-        sparams.seed = std::stoul(argv[i]);
        return true;
    }
    if (arg == "-t" || arg == "--threads") {
@@ -894,17 +768,13 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            invalid_param = true;
            return true;
        }
-        params.image.emplace_back(argv[i]);
+        params.image = argv[i];
        return true;
    }
    if (arg == "-i" || arg == "--interactive") {
        params.interactive = true;
        return true;
    }
-    if (arg == "--interactive-specials") {
-        params.interactive_specials = true;
-        return true;
-    }
    if (arg == "--embedding") {
        params.embedding = true;
        return true;
@@ -917,10 +787,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.instruct = true;
        return true;
    }
-    if (arg == "-cnv" || arg == "--conversation") {
-        params.conversation = true;
-        return true;
-    }
    if (arg == "-cml" || arg == "--chatml") {
        params.chatml = true;
        return true;
@@ -957,10 +823,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.cont_batching = true;
        return true;
    }
-    if (arg == "-fa" || arg == "--flash-attn") {
-        params.flash_attn = true;
-        return true;
-    }
    if (arg == "--color") {
        params.use_color = true;
        return true;
@@ -1060,14 +922,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif // GGML_USE_CUDA_SYCL_VULKAN
        return true;
    }
-    if (arg == "--rpc") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        params.rpc_servers = argv[i];
-        return true;
-    }
    if (arg == "--no-mmap") {
        params.use_mmap = false;
        return true;
@@ -1156,10 +1010,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.n_print = std::stoi(argv[i]);
        return true;
    }
-    if (arg == "--check-tensors") {
-        params.check_tensors = true;
-        return true;
-    }
    if (arg == "--ppl-output-type") {
        if (++i >= argc) {
            invalid_param = true;
@@ -1298,24 +1148,52 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        );
        return true;
    }
-    if (arg == "-j" || arg == "--json-schema") {
-        if (++i >= argc) {
-            invalid_param = true;
-            return true;
-        }
-        sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
-        return true;
-    }
    if (arg == "--override-kv") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
-        if (!parse_kv_override(argv[i], params.kv_overrides)) {
+        char* sep = strchr(argv[i], '=');
+        if (sep == nullptr || sep - argv[i] >= 128) {
+            fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
+            invalid_param = true;
+            return true;
+        }
+        struct llama_model_kv_override kvo;
+        std::strncpy(kvo.key, argv[i], sep - argv[i]);
+        kvo.key[sep - argv[i]] = 0;
+        sep++;
+        if (strncmp(sep, "int:", 4) == 0) {
+            sep += 4;
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.int_value = std::atol(sep);
+        }
+        else if (strncmp(sep, "float:", 6) == 0) {
+            sep += 6;
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+            kvo.float_value = std::atof(sep);
+        }
+        else if (strncmp(sep, "bool:", 5) == 0) {
+            sep += 5;
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+            if (std::strcmp(sep, "true") == 0) {
+                kvo.bool_value = true;
+            }
+            else if (std::strcmp(sep, "false") == 0) {
+                kvo.bool_value = false;
+            }
+            else {
+                fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
+                invalid_param = true;
+                return true;
+            }
+        }
+        else {
            fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
            invalid_param = true;
            return true;
        }
+        params.kv_overrides.push_back(kvo);
        return true;
    }
 #ifndef LOG_DISABLE_LOGS
@@ -1345,29 +1223,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
    return false;
 }

-void gpt_params_handle_model_default(gpt_params & params) {
-    if (!params.hf_repo.empty()) {
-        // short-hand to avoid specifying --hf-file -> default it to --model
-        if (params.hf_file.empty()) {
-            if (params.model.empty()) {
-                throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
-            }
-            params.hf_file = params.model;
-        } else if (params.model.empty()) {
-            params.model = "models/" + string_split(params.hf_file, '/').back();
-        }
-    } else if (!params.model_url.empty()) {
-        if (params.model.empty()) {
-            auto f = string_split(params.model_url, '#').front();
-            f = string_split(f, '?').front();
-            f = string_split(f, '/').back();
-            params.model =  "models/" + f;
-        }
-    } else if (params.model.empty()) {
-        params.model = DEFAULT_MODEL_PATH;
-    }
-}
-
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
    bool invalid_param = false;
    std::string arg;
@@ -1379,12 +1234,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }
+
        if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
            throw std::invalid_argument("error: unknown argument: " + arg);
        }
-        if (invalid_param) {
-            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
-        }
+    }
+
+    if (invalid_param) {
+        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
    }

    if (params.prompt_cache_all &&
@@ -1394,7 +1251,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

-    gpt_params_handle_model_default(params);
+    // short-hand to avoid specifying --hf-file -> default it to --model
+    if (!params.hf_repo.empty() && params.hf_file.empty()) {
+        params.hf_file = params.model;
+    }

    if (params.escape) {
        process_escapes(params.prompt);
@@ -1432,9 +1292,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -h, --help            show this help message and exit\n");
    printf("  --version             show version and build info\n");
    printf("  -i, --interactive     run in interactive mode\n");
-    printf("  --interactive-specials allow special tokens in user text, in interactive mode\n");
    printf("  --interactive-first   run in interactive mode and wait for input right away\n");
-    printf("  -cnv, --conversation  run in conversation mode (does not print special tokens and suffix/prefix)\n");
    printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
    printf("  -cml, --chatml        run in chatml mode (use with ChatML-compatible models)\n");
    printf("  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
@@ -1495,9 +1353,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
    printf("  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
    printf("  --grammar-file FNAME  file to read grammar from\n");
-    printf("  -j SCHEMA, --json-schema SCHEMA\n");
-    printf("                        JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
-    printf("                        For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
    printf("  --cfg-negative-prompt PROMPT\n");
    printf("                        negative prompt to use for guidance. (default: empty)\n");
    printf("  --cfg-negative-prompt-file FNAME\n");
@@ -1535,9 +1390,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
    printf("  -ps N, --p-split N    speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
    printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
-    printf("  -fa, --flash-attn     enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
    printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
-    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models. Specify multiple times for batching\n");
+    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models\n");
    if (llama_supports_mlock()) {
        printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
@@ -1565,7 +1419,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
        printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
        printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
    }
-    printf("  --rpc SERVERS         comma separated list of RPC servers\n");
    printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
    printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
    printf("  -gan N, --grp-attn-n N\n");
@@ -1591,7 +1444,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --control-vector-layer-range START END\n");
    printf("                        layer range to apply the control vector(s) to, start and end inclusive\n");
    printf("  -m FNAME, --model FNAME\n");
-    printf("                        model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
+    printf("                        model path (default: %s)\n", params.model.c_str());
    printf("  -md FNAME, --model-draft FNAME\n");
    printf("                        draft model for speculative decoding (default: unused)\n");
    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
@@ -1608,10 +1461,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                        types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf("  -ptc N, --print-token-count N\n");
    printf("                        print token count every N tokens (default: %d)\n", params.n_print);
-    printf("  --check-tensors       check model tensor data for invalid values\n");
    printf("\n");
 #ifndef LOG_DISABLE_LOGS
    log_print_usage();
@@ -1736,18 +1588,6 @@ std::vector<std::string> string_split(std::string input, char separator) {
    return parts;
 }

-std::string string_strip(const std::string & str) {
-    size_t start = 0;
-    size_t end = str.size();
-    while (start < end && std::isspace(str[start])) {
-        start++;
-    }
-    while (end > start && std::isspace(str[end - 1])) {
-        end--;
-    }
-    return str.substr(start, end - start);
-}
-
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
        {"top_k",       llama_sampler_type::TOP_K},
@@ -1839,13 +1679,11 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
    }
-    mparams.rpc_servers     = params.rpc_servers.c_str();
    mparams.main_gpu        = params.main_gpu;
    mparams.split_mode      = params.split_mode;
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
-    mparams.check_tensors   = params.check_tensors;
    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
    } else {
@@ -1910,7 +1748,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.cb_eval           = params.cb_eval;
    cparams.cb_eval_user_data = params.cb_eval_user_data;
    cparams.offload_kqv       = !params.no_kv_offload;
-    cparams.flash_attn        = params.flash_attn;

    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
@@ -1941,75 +1778,59 @@ void llama_batch_add(

 #ifdef LLAMA_USE_CURL

-static bool starts_with(const std::string & str, const std::string & prefix) {
-    // While we wait for C++20's std::string::starts_with...
-    return str.rfind(prefix, 0) == 0;
-}
-
-static bool llama_download_file(const std::string & url, const std::string & path) {
-
-    // Initialize libcurl
-    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
-    if (!curl) {
-        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
-        return false;
-    }
-
+static bool llama_download_file(CURL * curl, const char * url, const char * path) {
    bool force_download = false;

    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+    curl_easy_setopt(curl, CURLOPT_URL, url);
+    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);

 #if defined(_WIN32)
    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
    //   operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+    curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
 #endif

    // Check if the file already exists locally
    struct stat model_file_info;
-    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+    auto file_exists = (stat(path, &model_file_info) == 0);

-    // If the file exists, check its JSON metadata companion file.
-    std::string metadata_path = path + ".json";
-    nlohmann::json metadata;
-    std::string etag;
-    std::string last_modified;
+    // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
+    char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+    char etag_path[PATH_MAX] = {0};
+    snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
+
+    char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+    char last_modified_path[PATH_MAX] = {0};
+    snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);

    if (file_exists) {
-        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-        std::ifstream metadata_in(metadata_path);
-        if (metadata_in.good()) {
-            try {
-                metadata_in >> metadata;
-                fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("url") && metadata.at("url").is_string()) {
-                    auto previous_url = metadata.at("url").get<std::string>();
-                    if (previous_url != url) {
-                        fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
-                        return false;
-                    }
-                }
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
-                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
-                }
-            } catch (const nlohmann::json::exception & e) {
-                fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-                return false;
+        auto * f_etag = fopen(etag_path, "r");
+        if (f_etag) {
+            if (!fgets(etag, sizeof(etag), f_etag)) {
+                fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
+            } else {
+                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
            }
+            fclose(f_etag);
+        }
+
+        auto * f_last_modified = fopen(last_modified_path, "r");
+        if (f_last_modified) {
+            if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
+                fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
+            } else {
+                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
+                        last_modified);
+            }
+            fclose(f_last_modified);
        }
-    } else {
-        fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str());
    }

    // Send a HEAD request to retrieve the etag and last-modified headers
    struct llama_load_model_from_url_headers {
-        std::string etag;
-        std::string last_modified;
+        char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+        char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
    };
    llama_load_model_from_url_headers headers;
    {
@@ -2017,37 +1838,38 @@ static bool llama_download_file(const std::string & url, const std::string & pat
        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
            llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;

-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+            // Convert header field name to lowercase
+            for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
+                buffer[i] = tolower(buffer[i]);
+            }

-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
+            const char * etag_prefix = "etag: ";
+            if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
+                strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
+            }
+
+            const char * last_modified_prefix = "last-modified: ";
+            if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) {
+                strncpy(headers->last_modified, buffer + strlen(last_modified_prefix),
+                        n_items - strlen(last_modified_prefix) - 2); // Remove CRLF
            }
            return n_items;
        };

-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+        curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress
+        curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+        curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers);

-        CURLcode res = curl_easy_perform(curl.get());
+        CURLcode res = curl_easy_perform(curl);
        if (res != CURLE_OK) {
+            curl_easy_cleanup(curl);
            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
            return false;
        }

        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+        curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
        if (http_code != 200) {
            // HEAD not supported, we don't know if the file has changed
            // force trigger downloading
@@ -2056,30 +1878,28 @@ static bool llama_download_file(const std::string & url, const std::string & pat
        }
    }

-    bool should_download = !file_exists || force_download;
-    if (!should_download) {
-        if (!etag.empty() && etag != headers.etag) {
-            fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
-            should_download = true;
-        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
-            fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
-            should_download = true;
-        }
-    }
+    // If the ETag or the Last-Modified headers are different: trigger a new download
+    bool should_download = !file_exists
+        || force_download
+        || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0)
+        || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0);
    if (should_download) {
-        std::string path_temporary = path + ".downloadInProgress";
+        char path_temporary[PATH_MAX] = {0};
+        snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
        if (file_exists) {
-            fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
-            if (remove(path.c_str()) != 0) {
-                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str());
+            fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path);
+            if (remove(path) != 0) {
+                curl_easy_cleanup(curl);
+                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
                return false;
            }
        }

        // Set the output file
-        std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path_temporary.c_str(), "wb"), fclose);
+        auto * outfile = fopen(path_temporary, "wb");
        if (!outfile) {
-            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
+            curl_easy_cleanup(curl);
+            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
            return false;
        }

@@ -2087,12 +1907,12 @@ static bool llama_download_file(const std::string & url, const std::string & pat
        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
            return fwrite(data, size, nmemb, (FILE *)fd);
        };
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
+        curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
+        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+        curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile);

        //  display download progress
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
+        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);

        // helper function to hide password in URL
        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
@@ -2111,34 +1931,51 @@ static bool llama_download_file(const std::string & url, const std::string & pat

        // start the download
        fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-                llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-        auto res = curl_easy_perform(curl.get());
+                llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified);
+        auto res = curl_easy_perform(curl);
        if (res != CURLE_OK) {
+            fclose(outfile);
+            curl_easy_cleanup(curl);
            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
            return false;
        }

        long http_code = 0;
-        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+        curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code);
        if (http_code < 200 || http_code >= 400) {
+            fclose(outfile);
+            curl_easy_cleanup(curl);
            fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
            return false;
        }

-        // Causes file to be closed explicitly here before we rename it.
-        outfile.reset();
+        // Clean up
+        fclose(outfile);

-        // Write the updated JSON metadata file.
-        metadata.update({
-            {"url", url},
-            {"etag", headers.etag},
-            {"lastModified", headers.last_modified}
-        });
-        std::ofstream(metadata_path) << metadata.dump(4);
-        fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
+        // Write the new ETag to the .etag file
+        if (strlen(headers.etag) > 0) {
+            auto * etag_file = fopen(etag_path, "w");
+            if (etag_file) {
+                fputs(headers.etag, etag_file);
+                fclose(etag_file);
+                fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag);
+            }
+        }

-        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
-            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+        // Write the new lastModified to the .etag file
+        if (strlen(headers.last_modified) > 0) {
+            auto * last_modified_file = fopen(last_modified_path, "w");
+            if (last_modified_file) {
+                fputs(headers.last_modified, last_modified_file);
+                fclose(last_modified_file);
+                fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
+                        headers.last_modified);
+            }
+        }
+
+        if (rename(path_temporary, path) != 0) {
+            curl_easy_cleanup(curl);
+            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
            return false;
        }
    }
@@ -2156,7 +1993,15 @@ struct llama_model * llama_load_model_from_url(
        return NULL;
    }

-    if (!llama_download_file(model_url, path_model)) {
+    // Initialize libcurl
+    auto * curl = curl_easy_init();
+
+    if (!curl) {
+        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+        return NULL;
+    }
+
+    if (!llama_download_file(curl, model_url, path_model)) {
        return NULL;
    }

@@ -2170,6 +2015,7 @@ struct llama_model * llama_load_model_from_url(
        auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
        if (!ctx_gguf) {
            fprintf(stderr, "\n%s:  failed to load input GGUF from %s\n", __func__, path_model);
+            curl_easy_cleanup(curl);
            return NULL;
        }

@@ -2181,6 +2027,8 @@ struct llama_model * llama_load_model_from_url(
        gguf_free(ctx_gguf);
    }

+    curl_easy_cleanup(curl);
+
    if (n_split > 1) {
        char split_prefix[PATH_MAX] = {0};
        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
@@ -2211,7 +2059,11 @@ struct llama_model * llama_load_model_from_url(
                char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
                llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);

-                return llama_download_file(split_url, split_path);
+                auto * curl = curl_easy_init();
+                bool res = llama_download_file(curl, split_url, split_path);
+                curl_easy_cleanup(curl);
+
+                return res;
            }, idx));
        }

@@ -2386,12 +2238,12 @@ std::vector<llama_token> llama_tokenize(
    return result;
 }

-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
@@ -2553,7 +2405,7 @@ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const cha
    size_t pos_start = 0;
    size_t pos_found = 0;

-    if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
+    if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
@@ -2665,7 +2517,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
    fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
-    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
@@ -2699,7 +2550,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
-    fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
+    fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
@@ -2734,7 +2585,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);

    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
--- a/common/common.h
+++ b/common/common.h
@@ -31,8 +31,6 @@
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
 } while(0)

-#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
-
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const *LLAMA_COMMIT;
@@ -41,7 +39,6 @@ extern char const *LLAMA_BUILD_TARGET;

 struct llama_control_vector_load_info;

-int get_math_cpu_count();
 int32_t get_num_physical_cores();

 //
@@ -51,7 +48,7 @@ int32_t get_num_physical_cores();
 struct gpt_params {
    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed

-    int32_t n_threads             = get_math_cpu_count();
+    int32_t n_threads             = get_num_physical_cores();
    int32_t n_threads_draft       = -1;
    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
    int32_t n_threads_batch_draft = -1;
@@ -82,20 +79,19 @@ struct gpt_params {
    float   yarn_beta_slow        = 1.0f;  // YaRN high correction dim
    int32_t yarn_orig_ctx         = 0;     // YaRN original context length
    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
-    std::string rpc_servers       = "";    // comma separated list of RPC servers

    ggml_backend_sched_eval_callback cb_eval = nullptr;
    void * cb_eval_user_data                 = nullptr;

    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

-    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
-    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+    llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings

    // // sampling parameters
    struct llama_sampling_params sparams;

-    std::string model                = "";  // model path
+    std::string model                = "models/7B/ggml-model-f16.gguf"; // model path
    std::string model_draft          = "";  // draft model for speculative decoding
    std::string model_alias          = "unknown"; // model alias
    std::string model_url            = "";  // model url to download
@@ -136,13 +132,11 @@ struct gpt_params {
    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

-    bool   kl_divergence   = false; // compute KL divergence
+    bool   kl_divergence   = false; // compute KL-divergence

    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode
-    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
-    bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
    bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
@@ -153,7 +147,6 @@ struct gpt_params {
    bool multiline_input   = false; // reverse the usage of `\`
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
-    bool flash_attn        = false; // flash attention

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool ignore_eos        = false; // ignore generated EOS tokens
@@ -167,20 +160,15 @@ struct gpt_params {
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
-    bool check_tensors     = false; // validate tensor data

    std::string cache_type_k = "f16"; // KV cache data type for the K
    std::string cache_type_v = "f16"; // KV cache data type for the V

    // multimodal models (see examples/llava)
-    std::string mmproj = "";        // path to multimodal projector
-    std::vector<std::string> image; // path to image file(s)
+    std::string mmproj = ""; // path to multimodal projector
+    std::string image  = ""; // path to an image file
 };

-void gpt_params_handle_model_default(gpt_params & params);
-
-bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
-
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -204,7 +192,6 @@ bool validate_file_name(const std::string & filename);
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string> string_split(std::string input, char separator);
-std::string string_strip(const std::string & str);
 std::string sampler_type_to_name_string(llama_sampler_type sampler_type);

 //
@@ -249,12 +236,11 @@ std::vector<llama_token> llama_tokenize(
                        bool   add_special,
                        bool   parse_special = false);

-// tokenizes a token into a piece, optionally renders special/control tokens
+// tokenizes a token into a piece
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
        const struct llama_context * ctx,
-                       llama_token   token,
-                       bool          special = true);
+                       llama_token   token);

 // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
 //       that takes into account the tokenizer type and decides how to handle the leading space
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -26,7 +26,7 @@ namespace grammar_parser {

    static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
+        auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
        return result.first->second;
    }

@@ -142,9 +142,6 @@ namespace grammar_parser {
                pos++;
                last_sym_start = out_elements.size();
                while (*pos != '"') {
-                    if (!*pos) {
-                        throw std::runtime_error("unexpected end of input");
-                    }
                    auto char_pair = parse_char(pos);
                         pos       = char_pair.second;
                    out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -159,9 +156,6 @@ namespace grammar_parser {
                }
                last_sym_start = out_elements.size();
                while (*pos != ']') {
-                    if (!*pos) {
-                        throw std::runtime_error("unexpected end of input");
-                    }
                    auto char_pair = parse_char(pos);
                         pos       = char_pair.second;
                    enum llama_gretype type = last_sym_start < out_elements.size()
@@ -170,9 +164,6 @@ namespace grammar_parser {

                    out_elements.push_back({type, char_pair.first});
                    if (pos[0] == '-' && pos[1] != ']') {
-                        if (!pos[1]) {
-                            throw std::runtime_error("unexpected end of input");
-                        }
                        auto endchar_pair = parse_char(pos + 1);
                             pos          = endchar_pair.second;
                        out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -11,101 +11,35 @@

 using json = nlohmann::ordered_json;

-template <typename Iterator>
-static std::string join(Iterator begin, Iterator end, const std::string & separator);
-
-static std::string repeat(const std::string & str, size_t n);
-
-static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
-    if (separator_rule.empty()) {
-        if (min_items == 0 && max_items == 1) {
-            return item_rule + "?";
-        } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
-            return item_rule + "+";
-        }
-    }
-
-    std::string result;
-    if (min_items > 0) {
-        if (item_rule_is_literal && separator_rule.empty()) {
-            result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
-        } else {
-            std::vector<std::string> items(min_items, item_rule);
-            result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
-        }
-    }
-
-    std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
-        auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
-
-        if (up_to_n == 0) {
-            return "";
-        } else if (up_to_n == 1) {
-            return "(" + content + ")?";
-        } else if (!separator_rule.empty() && !prefix_with_sep) {
-            return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
-        } else {
-            std::string res = repeat("(" + content + " ", up_to_n);
-            // strip trailing space
-            res = res.substr(0, res.length() - 1);
-            res += repeat(")?", up_to_n);
-            return res;
-        }
-    };
-
-    if (min_items > 0 && max_items != min_items) {
-        result += " ";
-    }
-
-    if (max_items != std::numeric_limits<int>::max()) {
-        result += opt_repetitions(max_items - min_items, min_items > 0);
-    } else {
-        std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
-        if (min_items == 0 && !separator_rule.empty()) {
-            result = "(" + item_rule + " " + item_operator + "*)?";
-        } else {
-            result += item_operator + "*";
-        }
-    }
-
-    return result;
-}
-
 const std::string SPACE_RULE = "\" \"?";

-struct BuiltinRule {
-    std::string content;
-    std::vector<std::string> deps;
+std::unordered_map<std::string, std::string> PRIMITIVE_RULES = {
+    {"boolean", "(\"true\" | \"false\") space"},
+    {"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"},
+    {"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"},
+    {"value", "object | array | string | number | boolean"},
+    {"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"},
+    {"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"},
+    {"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"},
+    {"string", " \"\\\"\" (\n"
+               "        [^\"\\\\] |\n"
+               "        \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n"
+               "      )* \"\\\"\" space"},
+    {"null", "\"null\" space"}
 };
+std::vector<std::string> OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"};

-const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
-
-std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
-    {"boolean", {"(\"true\" | \"false\") space", {}}},
-    {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
-    {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
-    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
-    {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
-    {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
-    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
-    {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
-    {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
-    {"char",   {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
-    {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
-    {"null", {"\"null\" space", {}}},
-};
-
-std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
-    {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
-    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
-    {"date-time", {"date \"T\" time", {"date", "time"}}},
-    {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
-    {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
-    {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
+std::unordered_map<std::string, std::string> DATE_RULES = {
+    {"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"},
+    {"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"},
+    {"date-time", "date \"T\" time"},
+    {"date-string", "\"\\\"\" date \"\\\"\" space"},
+    {"time-string", "\"\\\"\" time \"\\\"\" space"},
+    {"date-time-string", "\"\\\"\" date-time \"\\\"\" space"}
 };

 static bool is_reserved_name(const std::string & name) {
@@ -113,7 +47,7 @@ static bool is_reserved_name(const std::string & name) {
    if (RESERVED_NAMES.empty()) {
        RESERVED_NAMES.insert("root");
        for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
-        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
+        for (const auto &p : DATE_RULES) RESERVED_NAMES.insert(p.first);
    }
    return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }
@@ -258,7 +192,7 @@ private:
                if (_dotall) {
                    rule = "[\\U00000000-\\U0010FFFF]";
                } else {
-                    rule = "[^\\x0A\\x0D]";
+                    rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]";
                }
                return _add_rule("dot", rule);
            };
@@ -272,7 +206,7 @@ private:
                    if (literal.empty()) {
                        return false;
                    }
-                    ret.emplace_back(literal, true);
+                    ret.push_back(std::make_pair(literal, true));
                    literal.clear();
                    return true;
                };
@@ -298,7 +232,7 @@ private:
            while (i < length) {
                char c = sub_pattern[i];
                if (c == '.') {
-                    seq.emplace_back(get_dot(), false);
+                    seq.push_back(std::make_pair(get_dot(), false));
                    i++;
                } else if (c == '(') {
                    i++;
@@ -307,7 +241,7 @@ private:
                            _warnings.push_back("Unsupported pattern syntax");
                        }
                    }
-                    seq.emplace_back("(" + to_rule(transform()) + ")", false);
+                    seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false));
                } else if (c == ')') {
                    i++;
                    if (start > 0 && sub_pattern[start - 1] != '(') {
@@ -331,9 +265,9 @@ private:
                    }
                    square_brackets += ']';
                    i++;
-                    seq.emplace_back(square_brackets, false);
+                    seq.push_back(std::make_pair(square_brackets, false));
                } else if (c == '|') {
-                    seq.emplace_back("|", false);
+                    seq.push_back(std::make_pair("|", false));
                    i++;
                } else if (c == '*' || c == '+' || c == '?') {
                    seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
@@ -374,21 +308,47 @@ private:
                    auto &sub = last.first;
                    auto sub_is_literal = last.second;

-                    if (!sub_is_literal) {
-                        std::string & sub_id = sub_rule_ids[sub];
-                        if (sub_id.empty()) {
-                            sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
+                    if (min_times == 0 && max_times == std::numeric_limits<int>::max()) {
+                        sub += "*";
+                    } else if (min_times == 0 && max_times == 1) {
+                        sub += "?";
+                    } else if (min_times == 1 && max_times == std::numeric_limits<int>::max()) {
+                        sub += "+";
+                    } else {
+                        if (!sub_is_literal) {
+                            std::string & sub_id = sub_rule_ids[sub];
+                            if (sub_id.empty()) {
+                                sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
+                            }
+                            sub = sub_id;
                        }
-                        sub = sub_id;
+                        std::string result;
+                        if (sub_is_literal && min_times > 0) {
+                            result = "\"" + repeat(sub.substr(1, sub.length() - 2), min_times) + "\"";
+                        } else {
+                            for (int j = 0; j < min_times; j++) {
+                                if (j > 0) {
+                                    result += " ";
+                                }
+                                result += sub;
+                            }
+                        }
+                        if (min_times > 0 && min_times < max_times) {
+                            result += " ";
+                        }
+                        if (max_times == std::numeric_limits<int>::max()) {
+                            result += sub + "*";
+                        } else {
+                            for (int j = min_times; j < max_times; j++) {
+                                if (j > min_times) {
+                                    result += " ";
+                                }
+                                result += sub + "?";
+                            }
+                        }
+                        seq.back().first = result;
+                        seq.back().second = false;
                    }
-                    seq.back().first = build_repetition(
-                        sub_is_literal ? "\"" + sub + "\"" : sub,
-                        min_times,
-                        max_times,
-                        "",
-                        sub_is_literal
-                    );
-                    seq.back().second = false;
                } else {
                    std::string literal;
                    auto is_non_literal = [&](char c) {
@@ -417,7 +377,7 @@ private:
                        }
                    }
                    if (!literal.empty()) {
-                        seq.emplace_back(literal, true);
+                        seq.push_back(std::make_pair(literal, true));
                    }
                }
            }
@@ -464,7 +424,7 @@ private:
        if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
            std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
            std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
-            std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
+            std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
            prop_kv_rule_names["*"] = kv_rule;
            optional_props.push_back("*");
        }
@@ -526,25 +486,6 @@ private:
        return rule;
    }

-    std::string _add_primitive(const std::string & name, const BuiltinRule & rule) {
-        auto n = _add_rule(name, rule.content);
-        for (const auto & dep : rule.deps) {
-            BuiltinRule dep_rule;
-            auto it = PRIMITIVE_RULES.find(dep);
-            if (it == PRIMITIVE_RULES.end()) {
-                it = STRING_FORMAT_RULES.find(dep);
-                if (it == STRING_FORMAT_RULES.end()) {
-                    _errors.push_back("Rule " + dep + " not known");
-                    continue;
-                }
-            }
-            if (_rules.find(dep) == _rules.end()) {
-                _add_primitive(dep, it->second);
-            }
-        }
-        return n;
-    }
-
 public:
    SchemaConverter(
        const std::function<json(const std::string &)> & fetch_json,
@@ -706,33 +647,49 @@ public:
                return _add_rule(rule_name, rule);
            } else {
                std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
+                std::string list_item_operator = "( \",\" space " + item_rule_name + " )";
+                std::string successive_items;
                int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
                json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
-                int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
-
-                return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
+                int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : -1;
+                if (min_items > 0) {
+                    successive_items += repeat(list_item_operator, min_items - 1);
+                    min_items--;
+                }
+                if (max_items >= 0 && max_items > min_items) {
+                    successive_items += repeat(list_item_operator + "?", max_items - min_items - 1);
+                } else {
+                    successive_items += list_item_operator + "*";
+                }
+                std::string rule;
+                if (min_items == 0) {
+                    rule =  "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space";
+                } else {
+                    rule =  "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space";
+                }
+                return _add_rule(rule_name, rule);
            }
        } else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
            return _visit_pattern(schema["pattern"], rule_name);
        } else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
-            return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
-        } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
-            auto prim_name = schema_format + "-string";
-            return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
-        } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
-            std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
-            int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
-            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
-            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
+            return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
+        } else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) {
+            for (const auto & kv : DATE_RULES) {
+                _add_rule(kv.first, kv.second);
+            }
+            return schema_format + "-string";
        } else if (schema.empty() || schema_type == "object") {
-            return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
+            for (const auto & n : OBJECT_RULE_NAMES) {
+                _add_rule(n, PRIMITIVE_RULES.at(n));
+            }
+            return _add_rule(rule_name, "object");
        } else {
            if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
                _errors.push_back("Unrecognized schema: " + schema.dump());
                return "";
            }
            // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
-            return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
+            return _add_rule(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
        }
    }

--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@@ -1,8 +1,4 @@
 #pragma once
-
-#include "ggml.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

 std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
--- a/common/log.h
+++ b/common/log.h
@@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
        #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
    #else
        #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
-        #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
+        #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
    #endif
 #else
    #define LOG_FLF_FMT "%s"
@@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
        #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
    #else
        #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
-        #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
+        #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
    #endif
 #else
    #define LOG_TEE_FLF_FMT "%s"
@@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // INTERNAL, DO NOT USE
 //  USE LOG() INSTEAD
 //
-#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
+#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
    #define LOG_IMPL(str, ...)                                                                                      \
    do {                                                                                                            \
        if (LOG_TARGET != nullptr)                                                                                  \
@@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // INTERNAL, DO NOT USE
 //  USE LOG_TEE() INSTEAD
 //
-#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
+#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
    #define LOG_TEE_IMPL(str, ...)                                                                                                      \
    do {                                                                                                                                \
        if (LOG_TARGET != nullptr)                                                                                                      \
@@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // Main LOG macro.
 //  behaves like printf, and supports arguments the exact same way.
 //
-#if !defined(_MSC_VER) || defined(__clang__)
+#ifndef _MSC_VER
    #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
 #else
    #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
@@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // Secondary target can be changed just like LOG_TARGET
 //  by defining LOG_TEE_TARGET
 //
-#if !defined(_MSC_VER) || defined(__clang__)
+#ifndef _MSC_VER
    #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
 #else
    #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
 #endif

 // LOG macro variants with auto endline.
-#if !defined(_MSC_VER) || defined(__clang__)
+#ifndef _MSC_VER
    #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
    #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
 #else
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,6 +1,4 @@
-#define LLAMA_API_INTERNAL
 #include "sampling.h"
-#include <random>

 struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
    struct llama_sampling_context * result = new llama_sampling_context();
@@ -35,10 +33,6 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

    result->prev.resize(params.n_prev);

-    result->n_valid = 0;
-
-    llama_sampling_set_rng_seed(result, params.seed);
-
    return result;
 }

@@ -66,14 +60,6 @@ void llama_sampling_reset(llama_sampling_context * ctx) {

    std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
    ctx->cur.clear();
-    ctx->n_valid = 0;
-}
-
-void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
-    if (seed == LLAMA_DEFAULT_SEED) {
-        seed = std::random_device{}();
-    }
-    ctx->rng.seed(seed);
 }

 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
@@ -217,7 +203,7 @@ static llama_token llama_sampling_sample_impl(

            sampler_queue(ctx_main, params, cur_p, min_keep);

-            id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);
+            id = llama_sample_token(ctx_main, &cur_p);

            //{
            //    const int n_top = 10;
@@ -256,8 +242,6 @@ static llama_token llama_sampling_sample_impl(
        }
    }

-    ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
-
    return id;
 }

--- a/common/sampling.h
+++ b/common/sampling.h
@@ -4,10 +4,9 @@

 #include "grammar-parser.h"

-#include <random>
 #include <string>
-#include <unordered_map>
 #include <vector>
+#include <unordered_map>

 // sampler types
 enum class llama_sampler_type : char {
@@ -21,26 +20,25 @@ enum class llama_sampler_type : char {

 // sampling parameters
 typedef struct llama_sampling_params {
-    int32_t     n_prev                = 64;                 // number of previous tokens to remember
-    int32_t     n_probs               = 0;                  // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t     min_keep              = 0;                  // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t     top_k                 = 40;                 // <= 0 to use vocab size
-    float       top_p                 = 0.95f;              // 1.0 = disabled
-    float       min_p                 = 0.05f;              // 0.0 = disabled
-    float       tfs_z                 = 1.00f;              // 1.0 = disabled
-    float       typical_p             = 1.00f;              // 1.0 = disabled
-    float       temp                  = 0.80f;              // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float       dynatemp_range        = 0.00f;              // 0.0 = disabled
-    float       dynatemp_exponent     = 1.00f;              // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t     penalty_last_n        = 64;                 // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float       penalty_repeat        = 1.00f;              // 1.0 = disabled
-    float       penalty_freq          = 0.00f;              // 0.0 = disabled
-    float       penalty_present       = 0.00f;              // 0.0 = disabled
-    int32_t     mirostat              = 0;                  // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float       mirostat_tau          = 5.00f;              // target entropy
-    float       mirostat_eta          = 0.10f;              // learning rate
-    bool        penalize_nl           = false;              // consider newlines as a repeatable token
-    uint32_t    seed                  = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
+    int32_t     n_prev                = 64;       // number of previous tokens to remember
+    int32_t     n_probs               = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t     min_keep              = 0;        // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t     top_k                 = 40;       // <= 0 to use vocab size
+    float       top_p                 = 0.95f;    // 1.0 = disabled
+    float       min_p                 = 0.05f;    // 0.0 = disabled
+    float       tfs_z                 = 1.00f;    // 1.0 = disabled
+    float       typical_p             = 1.00f;    // 1.0 = disabled
+    float       temp                  = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float       dynatemp_range        = 0.00f;    // 0.0 = disabled
+    float       dynatemp_exponent     = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float       penalty_repeat        = 1.00f;    // 1.0 = disabled
+    float       penalty_freq          = 0.00f;    // 0.0 = disabled
+    float       penalty_present       = 0.00f;    // 0.0 = disabled
+    int32_t     mirostat              = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float       mirostat_tau          = 5.00f;    // target entropy
+    float       mirostat_eta          = 0.10f;    // learning rate
+    bool        penalize_nl           = false;     // consider newlines as a repeatable token

    std::vector<llama_sampler_type> samplers_sequence = {
        llama_sampler_type::TOP_K,
@@ -81,9 +79,6 @@ struct llama_sampling_context {
    // TODO: replace with ring-buffer
    std::vector<llama_token>      prev;
    std::vector<llama_token_data> cur;
-    size_t n_valid; // Number of correct top tokens with correct probabilities.
-
-    std::mt19937 rng;
 };

 #include "common.h"
@@ -98,9 +93,6 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
 // - reset grammar
 void llama_sampling_reset(llama_sampling_context * ctx);

-// Set the sampler seed
-void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
-
 // Copy the sampler context
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);

--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -1,325 +0,0 @@
-#!/usr/bin/env python3
-
-# This script downloads the tokenizer models of the specified models from Huggingface and
-# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
-#
-# This is necessary in order to analyze the type of pre-tokenizer used by the model and
-# provide the necessary information to llama.cpp via the GGUF header in order to implement
-# the same pre-tokenizer.
-#
-# ref: https://github.com/ggerganov/llama.cpp/pull/6920
-#
-# Instructions:
-#
-# - Add a new model to the "models" list
-# - Run the script with your huggingface token:
-#
-#   python3 convert-hf-to-gguf-update.py <huggingface_token>
-#
-# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
-# - Update llama.cpp with the new pre-tokenizer if necessary
-#
-# TODO: generate tokenizer tests for llama.cpp
-#
-
-import logging
-import os
-import pathlib
-import re
-
-import requests
-import sys
-import json
-
-from hashlib import sha256
-from enum import IntEnum, auto
-from transformers import AutoTokenizer
-
-logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger("convert-hf-to-gguf-update")
-sess = requests.Session()
-
-
-class TOKENIZER_TYPE(IntEnum):
-    SPM = auto()
-    BPE = auto()
-    WPM = auto()
-
-
-# TODO: this string has to exercise as much pre-tokenizer functionality as possible
-#       will be updated with time - contributions welcome
-chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
-
-if len(sys.argv) == 2:
-    token = sys.argv[1]
-    if not token.startswith("hf_"):
-        logger.info("Huggingface token seems invalid")
-        logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
-        sys.exit(1)
-else:
-    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
-    sys.exit(1)
-
-# TODO: add models here, base models preferred
-models = [
-    {"name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
-    {"name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
-    {"name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
-    {"name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
-    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
-    {"name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
-    {"name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
-    {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
-    {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
-    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-    {"name": "refact",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
-    {"name": "command-r",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
-    {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
-    {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
-    {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
-    {"name": "jina-v2-en",     "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
-    {"name": "jina-v2-es",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
-    {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
-]
-
-
-def download_file_with_auth(url, token, save_path):
-    headers = {"Authorization": f"Bearer {token}"}
-    response = sess.get(url, headers=headers)
-    response.raise_for_status()
-    os.makedirs(os.path.dirname(save_path), exist_ok=True)
-    with open(save_path, 'wb') as f:
-        f.write(response.content)
-    logger.info(f"File {save_path} downloaded successfully")
-
-
-def download_model(model):
-    name = model["name"]
-    repo = model["repo"]
-    tokt = model["tokt"]
-
-    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
-
-    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
-    if tokt == TOKENIZER_TYPE.SPM:
-        files.append("tokenizer.model")
-
-    for file in files:
-        save_path = f"models/tokenizers/{name}/{file}"
-        if os.path.isfile(save_path):
-            logger.info(f"{name}: File {save_path} already exists - skipping")
-            continue
-        download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
-
-
-for model in models:
-    try:
-        download_model(model)
-    except Exception as e:
-        logger.error(f"Failed to download model {model['name']}. Error: {e}")
-
-
-# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
-
-src_ifs = ""
-for model in models:
-    name = model["name"]
-    tokt = model["tokt"]
-
-    if tokt == TOKENIZER_TYPE.SPM:
-        continue
-
-    # Skip if the tokenizer folder does not exist or there are other download issues previously
-    if not os.path.exists(f"models/tokenizers/{name}"):
-        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
-        continue
-
-    # create the tokenizer
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
-        logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-        continue  # Skip to the next model if the tokenizer can't be loaded
-
-    chktok = tokenizer.encode(chktxt)
-    chkhsh = sha256(str(chktok).encode()).hexdigest()
-
-    logger.info(f"model: {name}")
-    logger.info(f"tokt: {tokt}")
-    logger.info(f"repo: {model['repo']}")
-    logger.info(f"chktok: {chktok}")
-    logger.info(f"chkhsh: {chkhsh}")
-
-    # print the "pre_tokenizer" content from the tokenizer.json
-    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
-        cfg = json.load(f)
-        normalizer = cfg["normalizer"]
-        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
-        pre_tokenizer = cfg["pre_tokenizer"]
-        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
-        if "ignore_merges" in cfg["model"]:
-            logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))
-
-    logger.info("")
-
-    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
-    src_ifs += f"            # ref: {model['repo']}\n"
-    src_ifs += f"            res = \"{name}\"\n"
-
-src_func = f"""
-    def get_vocab_base_pre(self, tokenizer) -> str:
-        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
-        # is specific for the BPE pre-tokenizer used by the model
-        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
-        # use in llama.cpp to implement the same pre-tokenizer
-
-        chktxt = {repr(chktxt)}
-
-        chktok = tokenizer.encode(chktxt)
-        chkhsh = sha256(str(chktok).encode()).hexdigest()
-
-        logger.debug(f"chktok: {{chktok}}")
-        logger.debug(f"chkhsh: {{chkhsh}}")
-
-        res = None
-
-        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
-        #       or pull the latest version of the model from Huggingface
-        #       don't edit the hashes manually!
-{src_ifs}
-        if res is None:
-            logger.warning("\\n")
-            logger.warning("**************************************************************************************")
-            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
-            logger.warning("**          There are 2 possible reasons for this:")
-            logger.warning("**          - the model has not been added to convert-hf-to-gguf-update.py yet")
-            logger.warning("**          - the pre-tokenization config has changed upstream")
-            logger.warning("**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
-            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
-            logger.warning("**")
-            logger.warning(f"** chkhsh:  {{chkhsh}}")
-            logger.warning("**************************************************************************************")
-            logger.warning("\\n")
-            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
-
-        logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
-        logger.debug(f"chkhsh: {{chkhsh}}")
-
-        return res
-"""
-
-convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
-convert_py = convert_py_pth.read_text()
-convert_py = re.sub(
-    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
-    lambda m: m.group(1) + src_func + m.group(3),
-    convert_py,
-    flags=re.DOTALL | re.MULTILINE,
-)
-
-convert_py_pth.write_text(convert_py)
-
-logger.info("+++ convert-hf-to-gguf.py was updated")
-
-# generate tests for each tokenizer model
-
-tests = [
-    "ied 4 ½ months",
-    "Führer",
-    "",
-    " ",
-    "  ",
-    "   ",
-    "\t",
-    "\n",
-    "\n\n",
-    "\n\n\n",
-    "\t\n",
-    "Hello world",
-    " Hello world",
-    "Hello World",
-    " Hello World",
-    " Hello World!",
-    "Hello, world!",
-    " Hello, world!",
-    " this is 🦙.cpp",
-    "w048 7tuijk dsdfhu",
-    "нещо на Български",
-    "កាន់តែពិសេសអាចខលចេញ",
-    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
-    "Hello",
-    " Hello",
-    "  Hello",
-    "   Hello",
-    "    Hello",
-    "    Hello\n    Hello",
-    " (",
-    "\n =",
-    "' era",
-    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
-    "3",
-    "33",
-    "333",
-    "3333",
-    "33333",
-    "333333",
-    "3333333",
-    "33333333",
-    "333333333",
-    # "Cửa Việt", # llama-bpe fails on this
-    chktxt,
-]
-
-# write the tests to ./models/ggml-vocab-{name}.gguf.inp
-# the format is:
-#
-# test0
-# __ggml_vocab_test__
-# test1
-# __ggml_vocab_test__
-# ...
-#
-
-# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
-# for each test, write the resulting tokens on a separate line
-
-for model in models:
-    name = model["name"]
-    tokt = model["tokt"]
-
-    # Skip if the tokenizer folder does not exist or there are other download issues previously
-    if not os.path.exists(f"models/tokenizers/{name}"):
-        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
-        continue
-
-    # create the tokenizer
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    except OSError as e:
-        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
-        continue  # Skip this model and continue with the next one in the loop
-
-    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
-        for text in tests:
-            f.write(f"{text}")
-            f.write("\n__ggml_vocab_test__\n")
-
-    with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
-        for text in tests:
-            res = tokenizer.encode(text, add_special_tokens=False)
-            for r in res:
-                f.write(f" {r}")
-            f.write("\n")
-
-    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
-
-# generate commands for creating vocab files
-
-logger.info("\nRun the following commands to generate the vocab files for testing:\n")
-
-for model in models:
-    name = model["name"]
-
-    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
-
-logger.info("\n")
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
 from __future__ import annotations

-import logging
 import argparse
 import os
 import struct
@@ -15,8 +14,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

-logger = logging.getLogger("ggml-to-gguf")
-

 class GGMLFormat(IntEnum):
    GGML = 0
@@ -128,6 +125,7 @@ class Tensor:
        self.start_offset = offset
        self.len_bytes = n_bytes
        offset += n_bytes
+        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
        return offset - orig_offset


@@ -177,7 +175,7 @@ class GGMLModel:
        offset += self.validate_header(data, offset)
        hp = Hyperparameters()
        offset += hp.load(data, offset)
-        logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
+        print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
        self.validate_conversion(hp.ftype)
        vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
        offset += vocab.load(data, offset, hp.n_vocab)
@@ -217,12 +215,12 @@ class GGMLToGGUF:
                    if float(hp.n_head) / float(x) == gqa:
                        n_kv_head = x
                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
-                logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
+                print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
        self.n_kv_head = n_kv_head
        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)

    def save(self):
-        logger.info('* Preparing to save GGUF file')
+        print('* Preparing to save GGUF file')
        gguf_writer = gguf.GGUFWriter(
            self.cfg.output,
            gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
@@ -232,11 +230,11 @@ class GGMLToGGUF:
        if self.special_vocab is not None:
            self.special_vocab.add_to_gguf(gguf_writer)
        self.add_tensors(gguf_writer)
-        logger.info("    gguf: write header")
+        print("    gguf: write header")
        gguf_writer.write_header_to_file()
-        logger.info("    gguf: write metadata")
+        print("    gguf: write metadata")
        gguf_writer.write_kv_data_to_file()
-        logger.info("    gguf: write tensors")
+        print("    gguf: write tensors")
        gguf_writer.write_tensors_to_file()
        gguf_writer.close()

@@ -252,7 +250,7 @@ class GGMLToGGUF:
            name = cfg.name if cfg.name is not None else cfg.input.name
        except UnicodeDecodeError:
            name = None
-        logger.info('* Adding model parameters and KV items')
+        print('* Adding model parameters and KV items')
        if name is not None:
            gguf_writer.add_name(name)
        gguf_writer.add_description(desc)
@@ -283,13 +281,12 @@ class GGMLToGGUF:
    def add_vocab(self, gguf_writer):
        hp = self.model.hyperparameters
        gguf_writer.add_tokenizer_model('llama')
-        gguf_writer.add_tokenizer_pre('default')
        tokens = []
        scores = []
        toktypes = []
        if self.vocab_override is not None:
            vo = self.vocab_override
-            logger.info('* Adding vocab item(s)')
+            print('* Adding vocab item(s)')
            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
                tokens.append(vbytes)
                scores.append(score)
@@ -301,7 +298,7 @@ class GGMLToGGUF:
            if len(toktypes) > 0:
                gguf_writer.add_token_types(toktypes)
            return
-        logger.info(f'* Adding {hp.n_vocab} vocab item(s)')
+        print(f'* Adding {hp.n_vocab} vocab item(s)')
        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
            tt = 1 # Normal
@@ -336,7 +333,7 @@ class GGMLToGGUF:
    def add_tensors(self, gguf_writer):
        tensor_map = self.name_map
        data = self.data
-        logger.info(f'* Adding {len(self.model.tensors)} tensor(s)')
+        print(f'* Adding {len(self.model.tensors)} tensor(s)')
        for tensor in self.model.tensors:
            name = str(tensor.name, 'UTF-8')
            mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
@@ -346,6 +343,7 @@ class GGMLToGGUF:
                temp = tempdims[1]
                tempdims[1] = tempdims[0]
                tempdims[0] = temp
+            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
            gguf_writer.add_tensor(
                mapped_name,
                data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
@@ -402,35 +400,33 @@ def handle_args():
                        help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
    parser.add_argument("--vocabtype", default="spm,hfft",
                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
-    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
    return parser.parse_args()


 def main():
    cfg = handle_args()
-    logging.basicConfig(level=logging.DEBUG if cfg.verbose else logging.INFO)
-    logger.info(f'* Using config: {cfg}')
-    logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===')
+    print(f'* Using config: {cfg}')
+    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
    if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
-        logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
+        print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
    data = np.memmap(cfg.input, mode = 'r')
    model = GGMLModel()
-    logger.info('* Scanning GGML input file')
+    print('* Scanning GGML input file')
    offset = model.load(data, 0)  # noqa
-    logger.info(f'* GGML model hyperparameters: {model.hyperparameters}')
+    print(f'* GGML model hyperparameters: {model.hyperparameters}')
    vocab_override = None
    params_override = None
    special_vocab = None
    if cfg.model_metadata_dir is not None:
        (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
-        logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
-        logger.info(f'* Overriding params: {params_override}')
-        logger.info(f'* Overriding vocab: {vocab_override}')
-        logger.info(f'* Special vocab: {special_vocab}')
+        print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
+        print(f'* Overriding params: {params_override}')
+        print(f'* Overriding vocab: {vocab_override}')
+        print(f'* Special vocab: {special_vocab}')
    else:
-        logger.warning('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
+        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
        if model.file_format == GGMLFormat.GGML:
-            logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
+            print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
    converter = GGMLToGGUF(
        model, data, cfg,
        params_override = params_override,
@@ -438,7 +434,7 @@ def main():
        special_vocab = special_vocab
    )
    converter.save()
-    logger.info(f'* Successful completion. Output saved to: {cfg.output}')
+    print(f'* Successful completion. Output saved to: {cfg.output}')


 if __name__ == '__main__':
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import Any, BinaryIO, Sequence
+
+import numpy as np
+import torch
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
+
+
+def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
+    fout.write(b"ggla"[::-1])  # magic (ggml lora)
+    fout.write(struct.pack("i", 1))  # file version
+    fout.write(struct.pack("i", params["r"]))
+    # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
+    # but some models ship a float value instead
+    # let's convert to int, but fail if lossless conversion is not possible
+    assert (
+        int(params["lora_alpha"]) == params["lora_alpha"]
+    ), "cannot convert float to int losslessly"
+    fout.write(struct.pack("i", int(params["lora_alpha"])))
+
+
+def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
+    sname = name.encode("utf-8")
+    fout.write(
+        struct.pack(
+            "iii",
+            len(shape),
+            len(sname),
+            NUMPY_TYPE_TO_FTYPE[data_type.name],
+        )
+    )
+    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
+    fout.write(sname)
+    fout.seek((fout.tell() + 31) & -32)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print(f"Usage: python {sys.argv[0]} <path> [arch]")
+        print(
+            "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
+        )
+        print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
+        sys.exit(1)
+
+    input_json = os.path.join(sys.argv[1], "adapter_config.json")
+    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
+    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
+
+    if os.path.exists(input_model):
+        model = torch.load(input_model, map_location="cpu")
+    else:
+        input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
+        # lazy import load_file only if lora is in safetensors format.
+        from safetensors.torch import load_file
+        model = load_file(input_model, device="cpu")
+
+    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
+
+    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
+        print(f"Error: unsupported architecture {arch_name}")
+        sys.exit(1)
+
+    arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
+    name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
+
+    with open(input_json, "r") as f:
+        params = json.load(f)
+
+    if params["peft_type"] != "LORA":
+        print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
+        sys.exit(1)
+
+    if params["fan_in_fan_out"] is True:
+        print("Error: param fan_in_fan_out is not supported")
+        sys.exit(1)
+
+    if params["bias"] is not None and params["bias"] != "none":
+        print("Error: param bias is not supported")
+        sys.exit(1)
+
+    # TODO: these seem to be layers that have been trained but without lora.
+    # doesn't seem widely used but eventually should be supported
+    if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
+        print("Error: param modules_to_save is not supported")
+        sys.exit(1)
+
+    with open(output_path, "wb") as fout:
+        fout.truncate()
+
+        write_file_header(fout, params)
+        for k, v in model.items():
+            orig_k = k
+            if k.endswith(".default.weight"):
+                k = k.replace(".default.weight", ".weight")
+            if k in ["llama_proj.weight", "llama_proj.bias"]:
+                continue
+            if k.endswith("lora_A.weight"):
+                if v.dtype != torch.float16 and v.dtype != torch.float32:
+                    v = v.float()
+                v = v.T
+            else:
+                v = v.float()
+
+            t = v.detach().numpy()
+
+            prefix = "base_model.model."
+            if k.startswith(prefix):
+                k = k[len(prefix) :]
+
+            lora_suffixes = (".lora_A.weight", ".lora_B.weight")
+            if k.endswith(lora_suffixes):
+                suffix = k[-len(lora_suffixes[0]):]
+                k = k[: -len(lora_suffixes[0])]
+            else:
+                print(f"Error: unrecognized tensor name {orig_k}")
+                sys.exit(1)
+
+            tname = name_map.get_name(k)
+            if tname is None:
+                print(f"Error: could not map tensor name {orig_k}")
+                print(" Note: the arch parameter must be specified if the model is not llama")
+                sys.exit(1)
+
+            if suffix == ".lora_A.weight":
+                tname += ".weight.loraA"
+            elif suffix == ".lora_B.weight":
+                tname += ".weight.loraB"
+            else:
+                assert False
+
+            print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
+            write_tensor_header(fout, tname, t.shape, t.dtype)
+            t.tofile(fout)
+
+    print(f"Converted {input_json} and {input_model} to {output_path}")
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
 from __future__ import annotations

-import logging
 import argparse
 import os
 import sys
@@ -15,8 +14,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf

-logger = logging.getLogger("persimmon-to-gguf")
-

 def _flatten_dict(dct, tensors, prefix=None):
    assert isinstance(dct, dict)
@@ -33,9 +30,9 @@ def _flatten_dict(dct, tensors, prefix=None):

 def _get_sentencepiece_tokenizer_info(dir_model: Path):
    tokenizer_path = dir_model / 'adept_vocab.model'
-    logger.info('getting sentencepiece tokenizer from', tokenizer_path)
+    print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
    tokenizer = SentencePieceProcessor(str(tokenizer_path))
-    logger.info('adding tokens')
+    print('gguf: adding tokens')
    tokens: list[bytes] = []
    scores: list[float] = []
    toktypes: list[int] = []
@@ -70,10 +67,8 @@ def main():
    parser.add_argument("--outfile",             type=Path, help="path to write to; default: based on input")
    parser.add_argument("--ckpt-path",           type=Path, help="path to persimmon checkpoint .pt file")
    parser.add_argument("--model-dir",           type=Path, help="directory containing model e.g. 8b_chat_model_release")
-    parser.add_argument("--adept-inference-dir", type=str,  help="path to adept-inference code directory")
-    parser.add_argument("--verbose",  action="store_true",  help="increase output verbosity")
+    parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
    args = parser.parse_args()
-    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
    sys.path.append(str(args.adept_inference_dir))
    persimmon_model = torch.load(args.ckpt_path)
    hparams = persimmon_model['args']
@@ -104,7 +99,6 @@ def main():

    tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
    gguf_writer.add_tokenizer_model('llama')
-    gguf_writer.add_tokenizer_pre('default')
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
    gguf_writer.add_token_types(toktypes)
@@ -112,7 +106,7 @@ def main():
    gguf_writer.add_eos_token_id(71013)

    tensor_map = gguf.get_tensor_name_map(arch, block_count)
-    logger.info(tensor_map)
+    print(tensor_map)
    for name in tensors.keys():
        data_torch = tensors[name]
        if name.endswith(".self_attention.rotary_emb.inv_freq"):
@@ -122,21 +116,22 @@ def main():
        data = data_torch.to(torch.float32).squeeze().numpy()
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
-            raise ValueError(f"Can not map tensor '{name}'")
-
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
        n_dims = len(data.shape)
-        logger.debug(f"{new_name}, n_dims = {str(n_dims)}, {str(old_dtype)} --> {str(data.dtype)}")
+        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
        gguf_writer.add_tensor(new_name, data)
-    logger.info("gguf: write header")
+    print("gguf: write header")
    gguf_writer.write_header_to_file()
-    logger.info("gguf: write metadata")
+    print("gguf: write metadata")
    gguf_writer.write_kv_data_to_file()
-    logger.info("gguf: write tensors")
+    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

    gguf_writer.close()

-    logger.info(f"gguf: model successfully exported to '{args.outfile}'")
+    print(f"gguf: model successfully exported to '{args.outfile}'")
+    print("")


 if __name__ == '__main__':
--- a/convert.py
+++ b/convert.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python3
 from __future__ import annotations

-import logging
 import argparse
 import concurrent.futures
 import enum
@@ -24,7 +23,7 @@ from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable

 import numpy as np
 from sentencepiece import SentencePieceProcessor
@@ -36,8 +35,6 @@ import gguf
 if TYPE_CHECKING:
    from typing_extensions import Self, TypeAlias

-logger = logging.getLogger("convert")
-
 if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
    faulthandler.register(signal.SIGUSR1)

@@ -284,7 +281,6 @@ class Params:
        n_experts      = None
        n_experts_used = None
        f_rope_freq_base = None
-        n_ff = None

        # hack to determine LLaMA v1 vs v2 vs CodeLlama
        if config.get("moe"):
@@ -309,8 +305,6 @@ class Params:
            n_experts_used = config["moe"]["num_experts_per_tok"]
            f_rope_freq_base = 1e6

-        assert n_ff is not None
-
        return Params(
            n_vocab          = model["tok_embeddings.weight"].shape[0],
            n_embd           = config["dim"],
@@ -344,47 +338,10 @@ class Params:
        return params


-@dataclass
-class Metadata:
-    name: Optional[str] = None
-    author: Optional[str] = None
-    version: Optional[str] = None
-    url: Optional[str] = None
-    description: Optional[str] = None
-    licence: Optional[str] = None
-    source_url: Optional[str] = None
-    source_hf_repo: Optional[str] = None
-
-    @staticmethod
-    def load(metadata_path: Path) -> Metadata:
-        if metadata_path is None or not metadata_path.exists():
-            return Metadata()
-
-        with open(metadata_path, 'r') as file:
-            data = json.load(file)
-
-        # Create a new Metadata instance
-        metadata = Metadata()
-
-        # Assigning values to Metadata attributes if they exist in the JSON file
-        # This is based on LLM_KV_NAMES mapping in llama.cpp
-        metadata.name = data.get("general.name")
-        metadata.author = data.get("general.author")
-        metadata.version = data.get("general.version")
-        metadata.url = data.get("general.url")
-        metadata.description = data.get("general.description")
-        metadata.license = data.get("general.license")
-        metadata.source_url = data.get("general.source.url")
-        metadata.source_hf_repo = data.get("general.source.huggingface.repository")
-
-        return metadata
-
-
 #
 # vocab
 #

-
@runtime_checkable
 class BaseVocab(Protocol):
    tokenizer_model: ClassVar[str]
@@ -502,8 +459,7 @@ class SentencePieceVocab(Vocab):
            # not found in alternate location either
            raise FileNotFoundError('Cannot find tokenizer.model')

-        self.sentencepiece_tokenizer = SentencePieceProcessor()
-        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
        vocab_size = self.sentencepiece_tokenizer.vocab_size()

        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
@@ -523,23 +479,23 @@ class SentencePieceVocab(Vocab):
    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        tokenizer = self.sentencepiece_tokenizer
        for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(i)
+            piece = tokenizer.id_to_piece(i)
            text         = piece.encode("utf-8")
-            score: float = tokenizer.GetScore(i)
+            score: float = tokenizer.get_score(i)

            toktype = gguf.TokenType.NORMAL
-            if tokenizer.IsUnknown(i):
+            if tokenizer.is_unknown(i):
                toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.IsControl(i):
+            if tokenizer.is_control(i):
                toktype = gguf.TokenType.CONTROL

            # NOTE: I think added_tokens are user defined.
            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED

-            if tokenizer.IsUnused(i):
+            if tokenizer.is_unused(i):
                toktype = gguf.TokenType.UNUSED
-            if tokenizer.IsByte(i):
+            if tokenizer.is_byte(i):
                toktype = gguf.TokenType.BYTE

            yield text, score, toktype
@@ -569,14 +525,7 @@ class LlamaHfVocab(Vocab):

        # pre-check so we know if we need transformers
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        is_llama3 = (
-            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
-            and not tokenizer_model.get('byte_fallback', True)
-        )
-        if is_llama3:
-            raise TypeError('Llama 3 must be converted with BpeVocab')
-
-        if not is_llama3 and (
+        if (
            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):
@@ -687,6 +636,7 @@ class LlamaHfVocab(Vocab):


 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
+    # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@@ -947,7 +897,7 @@ class LazyUnpickler(pickle.Unpickler):
    def rebuild_from_type_v2(func, new_type, args, state):
        return func(*args)

-    CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
+    CLASSES = {
        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@@ -1076,12 +1026,12 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)

    # Check for a vocab size mismatch
    if params.n_vocab == vocab.vocab_size:
-        logger.warning("Ignoring added_tokens.json since model matches vocab size without it.")
+        print("Ignoring added_tokens.json since model matches vocab size without it.")
        return

    if pad_vocab and params.n_vocab > vocab.vocab_size:
        pad_count = params.n_vocab - vocab.vocab_size
-        logger.debug(
+        print(
            f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
        )
        for i in range(1, pad_count + 1):
@@ -1103,42 +1053,21 @@ class OutputFile:
    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

-    def add_meta_model(self, params: Params, metadata: Metadata) -> None:
-        # Metadata About The Model And Its Provenence
-        name = "LLaMA"
-        if metadata is not None and metadata.name is not None:
-            name = metadata.name
-        elif params.path_model is not None:
-            name = params.path_model.name
-        elif params.n_ctx == 4096:
-            # Heuristic detection of LLaMA v2 model
-            name = "LLaMA v2"
-
-        self.gguf.add_name(name)
-
-        if metadata is not None:
-            if metadata.author is not None:
-                self.gguf.add_author(metadata.author)
-            if metadata.version is not None:
-                self.gguf.add_version(metadata.version)
-            if metadata.url is not None:
-                self.gguf.add_url(metadata.url)
-            if metadata.description is not None:
-                self.gguf.add_description(metadata.description)
-            if metadata.licence is not None:
-                self.gguf.add_licence(metadata.licence)
-            if metadata.source_url is not None:
-                self.gguf.add_source_url(metadata.source_url)
-            if metadata.source_hf_repo is not None:
-                self.gguf.add_source_hf_repo(metadata.source_hf_repo)
-
    def add_meta_arch(self, params: Params) -> None:
-        # Metadata About The Neural Architecture Itself
-        self.gguf.add_vocab_size(params.n_vocab)
-        self.gguf.add_context_length(params.n_ctx)
-        self.gguf.add_embedding_length(params.n_embd)
-        self.gguf.add_block_count(params.n_layer)
-        self.gguf.add_feed_forward_length(params.n_ff)
+        name = "LLaMA"
+
+        # TODO: better logic to determine model name
+        if params.n_ctx == 4096:
+            name = "LLaMA v2"
+        elif params.path_model is not None:
+            name = str(params.path_model.parent).split('/')[-1]
+
+        self.gguf.add_name                (name)
+        self.gguf.add_vocab_size          (params.n_vocab)
+        self.gguf.add_context_length      (params.n_ctx)
+        self.gguf.add_embedding_length    (params.n_embd)
+        self.gguf.add_block_count         (params.n_layer)
+        self.gguf.add_feed_forward_length (params.n_ff)
        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
        self.gguf.add_head_count          (params.n_head)
        self.gguf.add_head_count_kv       (params.n_head_kv)
@@ -1230,7 +1159,7 @@ class OutputFile:
            elapsed = time.time() - start
            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
            padi = len(str(len(model)))
-            logger.info(
+            print(
                f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
            )
            self.gguf.write_tensor_data(ndarray)
@@ -1241,14 +1170,13 @@ class OutputFile:
    @staticmethod
    def write_vocab_only(
        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

        # meta data
-        of.add_meta_model(params, metadata)
        of.add_meta_arch(params)
        of.add_meta_vocab(vocab)
        of.add_meta_special_vocab(svocab)
@@ -1275,14 +1203,12 @@ class OutputFile:
        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
-        metadata: Metadata = None,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

        # meta data
-        of.add_meta_model(params, metadata)
        of.add_meta_arch(params)
        if isinstance(vocab, Vocab):
            of.add_meta_vocab(vocab)
@@ -1318,37 +1244,6 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
    raise ValueError(f"Unexpected combination of types: {name_to_type}")


-def model_parameter_count(model: LazyModel) -> int:
-    total_model_parameters = 0
-    for i, (name, lazy_tensor) in enumerate(model.items()):
-        sum_weights_in_tensor = 1
-        for dim in lazy_tensor.shape:
-            sum_weights_in_tensor *= dim
-        total_model_parameters += sum_weights_in_tensor
-    return total_model_parameters
-
-
-def model_parameter_count_rounded_notation(model_params_count: int) -> str:
-    if model_params_count > 1e12 :
-        # Trillions Of Parameters
-        scaled_model_params = model_params_count * 1e-12
-        scale_suffix = "T"
-    elif model_params_count > 1e9 :
-        # Billions Of Parameters
-        scaled_model_params = model_params_count * 1e-9
-        scale_suffix = "B"
-    elif model_params_count > 1e6 :
-        # Millions Of Parameters
-        scaled_model_params = model_params_count * 1e-6
-        scale_suffix = "M"
-    else:
-        # Thousands Of Parameters
-        scaled_model_params = model_params_count * 1e-3
-        scale_suffix = "K"
-
-    return f"{round(scaled_model_params)}{scale_suffix}"
-
-
 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
            for (name, tensor) in model.items()}
@@ -1379,12 +1274,12 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
    # HF models permut or pack some of the tensors, so we need to undo that
    for i in itertools.count():
        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
-            logger.debug(f"Permuting layer {i}")
+            print(f"Permuting layer {i}")
            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
            # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
-            logger.debug(f"Unpacking and permuting layer {i}")
+            print(f"Unpacking and permuting layer {i}")
            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
            tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy        (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
@@ -1397,15 +1292,15 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
        tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
        if name_new is None:
            if skip_unknown:
-                logger.warning(f"Unexpected tensor name: {name} - skipping")
+                print(f"Unexpected tensor name: {name} - skipping")
                continue
            raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")

        if tensor_type in should_skip:
-            logger.debug(f"skipping tensor {name_new}")
+            print(f"skipping tensor {name_new}")
            continue

-        logger.debug(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
+        print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
        out[name_new] = lazy_tensor

    return out
@@ -1470,7 +1365,7 @@ def load_some_model(path: Path) -> ModelPlus:
    paths = find_multifile_paths(path)
    models_plus: list[ModelPlus] = []
    for path in paths:
-        logger.info(f"Loading model file {path}")
+        print(f"Loading model file {path}")
        models_plus.append(lazy_load_file(path))

    model_plus = merge_multifile_models(models_plus)
@@ -1511,7 +1406,7 @@ class VocabFactory:
        else:
            raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")

-        logger.info(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
+        print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
        return vocab

    def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
@@ -1528,49 +1423,27 @@ class VocabFactory:
        return vocab, special_vocab


-def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
-    quantization = {
-        GGMLFileType.AllF32:    "F32",
-        GGMLFileType.MostlyF16: "F16",
-        GGMLFileType.MostlyQ8_0: "Q8_0",
+def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
+    namestr = {
+        GGMLFileType.AllF32:    "f32",
+        GGMLFileType.MostlyF16: "f16",
+        GGMLFileType.MostlyQ8_0:"q8_0",
    }[file_type]
-
-    parameters = model_parameter_count_rounded_notation(model_params_count)
-
-    expert_count = ""
-    if params.n_experts is not None:
-        expert_count = f"{params.n_experts}x"
-
-    version = ""
-    if metadata is not None and metadata.version is not None:
-        version = f"-{metadata.version}"
-
-    name = "ggml-model"
-    if metadata is not None and metadata.name is not None:
-        name = metadata.name
-    elif params.path_model is not None:
-        name = params.path_model.name
-
-    return f"{name}{version}-{expert_count}{parameters}-{quantization}"
-
-
-def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
-    default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
-    ret = model_paths[0].parent / f"{default_filename}.gguf"
+    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
    if ret in model_paths:
-        logger.error(
+        sys.stderr.write(
            f"Error: Default output path ({ret}) would overwrite the input. "
-            "Please explicitly specify a path using --outfile.")
+            "Please explicitly specify a path using --outfile.\n")
        sys.exit(1)
    return ret


 def do_dump_model(model_plus: ModelPlus) -> None:
-    print(f"model_plus.paths = {model_plus.paths!r}") # noqa: NP100
-    print(f"model_plus.format = {model_plus.format!r}") # noqa: NP100
-    print(f"model_plus.vocab = {model_plus.vocab!r}") # noqa: NP100
+    print(f"model_plus.paths = {model_plus.paths!r}")
+    print(f"model_plus.format = {model_plus.format!r}")
+    print(f"model_plus.vocab = {model_plus.vocab!r}")
    for name, lazy_tensor in model_plus.model.items():
-        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100
+        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")


 def main(args_in: list[str] | None = None) -> None:
@@ -1593,31 +1466,8 @@ def main(args_in: list[str] | None = None) -> None:
    parser.add_argument("--big-endian",   action="store_true",    help="model is executed on big endian machine")
    parser.add_argument("--pad-vocab",    action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")
    parser.add_argument("--skip-unknown", action="store_true",    help="skip unknown tensor names instead of failing")
-    parser.add_argument("--verbose",      action="store_true",    help="increase output verbosity")
-    parser.add_argument("--metadata",     type=Path,              help="Specify the path for a metadata file")
-    parser.add_argument("--get-outfile",  action="store_true",    help="get calculated default outfile name")

    args = parser.parse_args(args_in)
-
-    if args.verbose:
-        logging.basicConfig(level=logging.DEBUG)
-    elif args.dump_single or args.dump or args.get_outfile:
-        # Avoid printing anything besides the dump output
-        logging.basicConfig(level=logging.WARNING)
-    else:
-        logging.basicConfig(level=logging.INFO)
-
-    metadata = Metadata.load(args.metadata)
-
-    if args.get_outfile:
-        model_plus = load_some_model(args.model)
-        params = Params.load(model_plus)
-        model   = convert_model_names(model_plus.model, params, args.skip_unknown)
-        model_params_count = model_parameter_count(model_plus.model)
-        ftype   = pick_output_type(model, args.outtype)
-        print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
-        return
-
    if args.no_vocab and args.vocab_only:
        raise ValueError("--vocab-only does not make sense with --no-vocab")

@@ -1631,38 +1481,32 @@ def main(args_in: list[str] | None = None) -> None:
    else:
        model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)

-    model_params_count = model_parameter_count(model_plus.model)
-    logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")
-
    if args.dump:
        do_dump_model(model_plus)
        return
-
    endianess = gguf.GGUFEndian.LITTLE
    if args.big_endian:
        endianess = gguf.GGUFEndian.BIG

-    params = None
-    if args.pad_vocab or not args.vocab_only:
-        params = Params.load(model_plus)
-        if params.n_ctx == -1:
-            if args.ctx is None:
-                msg = """\
-                    The model doesn't have a context size, and you didn't specify one with --ctx
-                    Please specify one with --ctx:
-                     - LLaMA v1: --ctx 2048
-                     - LLaMA v2: --ctx 4096"""
-                parser.error(textwrap.dedent(msg))
-            params.n_ctx = args.ctx
+    params = Params.load(model_plus)
+    if params.n_ctx == -1:
+        if args.ctx is None:
+            msg = """\
+                The model doesn't have a context size, and you didn't specify one with --ctx
+                Please specify one with --ctx:
+                 - LLaMA v1: --ctx 2048
+                 - LLaMA v2: --ctx 4096"""
+            parser.error(textwrap.dedent(msg))
+        params.n_ctx = args.ctx

-        if args.outtype:
-            params.ftype = {
-                "f32": GGMLFileType.AllF32,
-                "f16": GGMLFileType.MostlyF16,
-                "q8_0": GGMLFileType.MostlyQ8_0,
-            }[args.outtype]
+    if args.outtype:
+        params.ftype = {
+            "f32": GGMLFileType.AllF32,
+            "f16": GGMLFileType.MostlyF16,
+            "q8_0": GGMLFileType.MostlyQ8_0,
+        }[args.outtype]

-        logger.info(f"params = {params}")
+    print(f"params = {params}")

    model_parent_path = model_plus.paths[0].parent
    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
@@ -1675,39 +1519,29 @@ def main(args_in: list[str] | None = None) -> None:
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
-        if params is None:
-            params = Params(
-                n_vocab    = vocab.vocab_size,
-                n_embd     = 1,
-                n_layer    = 1,
-                n_ctx      = 1,
-                n_ff       = 1,
-                n_head     = 1,
-                n_head_kv  = 1,
-                f_norm_eps = 1e-5,
-            )
        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
-                                    endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
-        logger.info(f"Wrote {outfile}")
+                                    endianess=endianess, pad_vocab=args.pad_vocab)
+        print(f"Wrote {outfile}")
        return

    if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
        vocab = model_plus.vocab

-    logger.info(f"Vocab info: {vocab}")
-    logger.info(f"Special vocab info: {special_vocab}")
+    print(f"Vocab info: {vocab}")
+    print(f"Special vocab info: {special_vocab}")
+
    model   = model_plus.model
    model   = convert_model_names(model, params, args.skip_unknown)
    ftype   = pick_output_type(model, args.outtype)
    model   = convert_to_output_type(model, ftype)
-    outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype)

    params.ftype = ftype
-    logger.info(f"Writing {outfile}, format {ftype}")
+    print(f"Writing {outfile}, format {ftype}")

    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
-    logger.info(f"Wrote {outfile}")
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
+    print(f"Wrote {outfile}")


 if __name__ == '__main__':
--- a/docs/BLIS.md
+++ b/docs/BLIS.md
@@ -23,7 +23,7 @@ Install BLIS:
 sudo make install
 ```

-We recommend using openmp since it's easier to modify the cores being used.
+We recommend using openmp since it's easier to modify the cores been used.

 ### llama.cpp compilation

--- a/docs/HOWTO-add-model.md
+++ b/docs/HOWTO-add-model.md
@@ -96,9 +96,9 @@ NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorc

 This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.

-Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`.
+Have a look to existing implementation like `build_llama`, `build_dbrx` or `build_bert`.

-When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
+When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR.

 Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).

--- a/docs/debugging-tests.md
+++ b/docs/debugging-tests.md
@@ -1,104 +0,0 @@
-# Debugging Tests Tips
-
-## How to run & execute or debug a specific test without anything else to keep the feedback loop short?
-
-There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number.
-
-For example, running the following command will output an interactive list from which you can select a test. It takes this form:
-
-`debug-test.sh [OPTION]... <test_regex> <test_number>`
-
-It will then build & run in the debugger for you.
-
-To just execute a test and get back a PASS or FAIL message run:
-
-```bash
-./scripts/debug-test.sh test-tokenizer
-```
-
-To test in GDB use the `-g` flag to enable gdb test mode.
-
-```bash
-./scripts/debug-test.sh -g test-tokenizer
-
-# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
->>> b main
-```
-
-To speed up the testing loop, if you know your test number you can just run it similar to below:
-
-```bash
-./scripts/debug-test.sh test 23
-```
-
-For further reference use `debug-test.sh -h` to print help.
-
-&nbsp;
-
-### How does the script work?
-If you want to be able to use the concepts contained in the script separately, the important ones are briefly outlined below.
-
-#### Step 1: Reset and Setup folder context
-
-From base of this repository, let's create `build-ci-debug` as our build context.
-
-```bash
-rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
-```
-
-#### Step 2: Setup Build Environment and Compile Test Binaries
-
-Setup and trigger a build under debug mode. You may adapt the arguments as needed, but in this case these are sane defaults.
-
-```bash
-cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
-make -j
-```
-
-#### Step 3: Find all tests available that matches REGEX
-
-The output of this command will give you the command & arguments needed to run GDB.
-
-* `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
-* `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
-* `-V` : Verbose Mode
-
-```bash
-ctest -R "test-tokenizer" -V -N
-```
-
-This may return output similar to below (focusing on key lines to pay attention to):
-
-```bash
-...
-1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
-1: Working Directory: .
-Labels: main
-  Test  #1: test-tokenizer-0-llama-spm
-...
-4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf"
-4: Working Directory: .
-Labels: main
-  Test  #4: test-tokenizer-0-falcon
-...
-```
-
-#### Step 4: Identify Test Command for Debugging
-
-So for test #1 above we can tell these two pieces of relevant information:
-* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
-* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
-
-#### Step 5: Run GDB on test command
-
-Based on the ctest 'test command' report above we can then run a gdb session via this command below:
-
-```bash
-gdb --args ${Test Binary} ${Test GGUF Model}
-```
-
-Example:
-
-```bash
-gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
-```
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -13,6 +13,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
 else()
    add_subdirectory(baby-llama)
+    add_subdirectory(test-bench)
    add_subdirectory(batched)
    add_subdirectory(batched-bench)
    add_subdirectory(beam-search)
@@ -49,7 +50,4 @@ else()
        add_subdirectory(server)
    endif()
    add_subdirectory(export-lora)
-    if (LLAMA_RPC)
-        add_subdirectory(rpc)
-    endif()
 endif()
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -32,7 +32,7 @@ int main(int argc, char ** argv) {
    gpt_params params;

    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
        printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
        printf("  example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
        return 1 ;
@@ -41,7 +41,6 @@ int main(int argc, char ** argv) {
    int n_kv_max     = 2048;
    int n_batch      = 2048;
    int n_ubatch     = 512;
-    bool flash_attn  = false;
    int is_pp_shared = 0;
    int n_gpu_layers = 0;

@@ -67,27 +66,23 @@ int main(int argc, char ** argv) {
    }

    if (argc >= 6) {
-        flash_attn = std::atoi(argv[5]);
+        is_pp_shared = std::atoi(argv[5]);
    }

    if (argc >= 7) {
-        is_pp_shared = std::atoi(argv[6]);
+        n_gpu_layers = std::atoi(argv[6]);
    }

    if (argc >= 8) {
-        n_gpu_layers = std::atoi(argv[7]);
+        n_pp = parse_list(argv[7]);
    }

    if (argc >= 9) {
-        n_pp = parse_list(argv[8]);
+        n_tg = parse_list(argv[8]);
    }

    if (argc >= 10) {
-        n_tg = parse_list(argv[9]);
-    }
-
-    if (argc >= 11) {
-        n_pl = parse_list(argv[10]);
+        n_pl = parse_list(argv[9]);
    }

    // init LLM
@@ -113,11 +108,10 @@ int main(int argc, char ** argv) {

    llama_context_params ctx_params = llama_context_default_params();

-    ctx_params.seed       = 1234;
-    ctx_params.n_ctx      = n_kv_max;
-    ctx_params.n_batch    = n_batch;
-    ctx_params.n_ubatch   = n_ubatch;
-    ctx_params.flash_attn = flash_attn;
+    ctx_params.seed      = 1234;
+    ctx_params.n_ctx     = n_kv_max;
+    ctx_params.n_batch   = n_batch;
+    ctx_params.n_ubatch  = n_ubatch;

    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -175,7 +169,7 @@ int main(int argc, char ** argv) {
    }

    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
    LOG_TEE("\n");

    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -153,7 +153,7 @@ while n_cur <= n_len {
        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

        // is it an end of stream? -> mark the stream as finished
-        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
+        if new_token_id == llama_token_eos(model) || n_cur == n_len {
            i_batch[i] = -1
            // print("")
            if n_parallel > 1 {
@@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {

 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
    var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
    if nTokens < 0 {
        let actualTokensCount = -Int(nTokens)
        result = .init(repeating: 0, count: actualTokensCount)
@@ -237,8 +237,7 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
            model,
            token,
            &result,
-            Int32(result.count),
-            false
+            Int32(result.count)
        )
        assert(check == actualTokensCount)
    } else {
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -191,8 +191,8 @@ int main(int argc, char ** argv) {

            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

-            // is it an end of generation? -> mark the stream as finished
-            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+            // is it an end of stream? -> mark the stream as finished
+            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
                i_batch[i] = -1;
                LOG_TEE("\n");
                if (n_parallel > 1) {
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@@ -47,7 +47,7 @@ struct beam_search_callback_data {
 // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
 // For example, eob can be flagged due to maximum token length, stop words, etc.
 static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
-    return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
+    return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
 }

 // Function matching type llama_beam_search_callback_fn_t.
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -146,7 +146,7 @@ int main(int argc, char ** argv)  {
        /* no_alloc   =*/ 0
    };

-    ctx = ggml_init(params);
+    int main_gpu = 0;
    if (!ctx) {
        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
        return 1;
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -2,7 +2,7 @@

 This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.

-To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:
+To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository:

 `$ make -j`

--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -49,12 +49,6 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
        }

        float * out = output + batch.seq_id[i][0] * n_embd;
-        //TODO: I would also add a parameter here to enable normalization or not.
-        /*fprintf(stdout, "unnormalized_embedding:");
-        for (int hh = 0; hh < n_embd; hh++) {
-            fprintf(stdout, "%9.6f ", embd[hh]);
-        }
-        fprintf(stdout, "\n");*/
        llama_embd_normalize(embd, out, n_embd);
    }
 }
@@ -129,12 +123,10 @@ int main(int argc, char ** argv) {
        inputs.push_back(inp);
    }

-    // check if the last token is SEP
-    // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
+    // add SEP if not present
    for (auto & inp : inputs) {
        if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
-            fprintf(stderr, "%s:          'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+            inp.push_back(llama_token_sep(model));
        }
    }

@@ -211,7 +203,6 @@ int main(int argc, char ** argv) {

    // clean up
    llama_print_timings(ctx);
-    llama_batch_free(batch);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -28,50 +28,40 @@ static std::string ggml_ne_string(const ggml_tensor * t) {
 }

 static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
-    GGML_ASSERT(n > 0);
    float sum = 0;
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
        printf("                                     [\n");
-        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
-            if (i2 == n && ne[2] > 2*n) {
-                printf("                                      ..., \n");
-                i2 = ne[2] - n;
-            }
+        for (int64_t i2 = 0; i2 < ne[2] && i2 < n; i2++) {
            printf("                                      [\n");
-            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
-                if (i1 == n && ne[1] > 2*n) {
-                    printf("                                       ..., \n");
-                    i1 = ne[1] - n;
-                }
+            for (int64_t i1 = 0; i1 < ne[1] && i1 < n; i1++) {
                printf("                                       [");
-                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
-                    if (i0 == n && ne[0] > 2*n) {
-                        printf("..., ");
-                        i0 = ne[0] - n;
-                    }
+                for (int64_t i0 = 0; i0 < ne[0] && i0 < n; i0++) {
                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
                    float v;
                    if (type == GGML_TYPE_F16) {
-                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
                    } else if (type == GGML_TYPE_F32) {
-                        v = *(float *) &data[i];
+                        v = *(float *) data + i;
                    } else if (type == GGML_TYPE_I32) {
-                        v = (float) *(int32_t *) &data[i];
+                        v = (float) *(int32_t *) data + i;
                    } else if (type == GGML_TYPE_I16) {
-                        v = (float) *(int16_t *) &data[i];
+                        v = (float) *(int16_t *) data + i;
                    } else if (type == GGML_TYPE_I8) {
-                        v = (float) *(int8_t *) &data[i];
+                        v = (float) *(int8_t *) data + i;
                    } else {
                        GGML_ASSERT(false);
                    }
-                    printf("%12.4f", v);
+                    printf("%8.4f", v);
                    sum += v;
-                    if (i0 < ne[0] - 1) printf(", ");
+                    if (i0 < ne[0] - 1 && i0 < n - 1) printf(", ");
                }
+                if (ne[0] > n) printf(", ...");
                printf("],\n");
            }
+            if (ne[1] > n) printf("                                       ...\n");
            printf("                                      ],\n");
        }
+        if (ne[2] > n) printf("                                     ...\n");
        printf("                                     ]\n");
        printf("                                     sum = %f\n", sum);
    }
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);

    auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
+        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
            return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
        } else if (a->type == GGML_TYPE_F32) {
            return ggml_add(ctx, a, b);
--- a/examples/gguf-split/README.md
+++ b/examples/gguf-split/README.md
@@ -5,6 +5,5 @@ CLI to split / merge GGUF files.
 **Command line options:**

 - `--split`: split GGUF to multiple GGUF, default operation.
- `--split-max-size`: max size per split in `M` or `G`, f.ex. `500M` or `2G`.
 - `--split-max-tensors`: maximum tensors in each split: default(128)
 - `--merge`: merge multiple GGUF to a single GGUF.
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -32,7 +32,6 @@ struct split_params {
    int n_split_tensors = 128;
    std::string input;
    std::string output;
-    bool no_tensor_first_split = false;
    bool dry_run = false;
 };

@@ -50,7 +49,6 @@ static void split_print_usage(const char * executable) {
    printf("  --merge                 merge multiple GGUF to a single GGUF\n");
    printf("  --split-max-tensors     max tensors in each split (default: %d)\n", default_params.n_split_tensors);
    printf("  --split-max-size N(M|G) max size per split\n");
-    printf("  --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
    printf("  --dry-run               only print out a split plan and exit, without writing any new files\n");
    printf("\n");
 }
@@ -61,10 +59,10 @@ static size_t split_str_to_n_bytes(std::string str) {
    int n;
    if (str.back() == 'M') {
        sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n * 1024 * 1024; // megabytes
+        n_bytes = n * 1024 * 1024; // megabytes
    } else if (str.back() == 'G') {
        sscanf(str.c_str(), "%d", &n);
-        n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
+        n_bytes = n * 1024 * 1024 * 1024; // gigabytes
    } else {
        throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
    }
@@ -102,10 +100,6 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
            arg_found = true;
            params.dry_run = true;
        }
-        if (arg == "--no-tensor-first-split") {
-            arg_found = true;
-            params.no_tensor_first_split = true;
-        }

        if (is_op_set) {
            throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
@@ -206,10 +200,10 @@ struct split_strategy {
        // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
        int i_split = -1;
        struct gguf_context * ctx_out = NULL;
-        auto new_ctx_out = [&](bool allow_no_tensors) {
+        auto new_ctx_out = [&]() {
            i_split++;
            if (ctx_out != NULL) {
-                if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
+                if (gguf_get_n_tensors(ctx_out) == 0) {
                    fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
                    exit(EXIT_FAILURE);
                }
@@ -226,12 +220,7 @@ struct split_strategy {
        };

        // initialize ctx_out for the first split
-        new_ctx_out(false);
-
-        // skip first split if no_tensor_first_split is set
-        if (params.no_tensor_first_split) {
-            new_ctx_out(true);
-        }
+        new_ctx_out();

        // process tensors one by one
        size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
@@ -241,7 +230,7 @@ struct split_strategy {
            size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
            size_t next_tensors_size = curr_tensors_size + n_bytes;
            if (should_split(i, next_tensors_size)) {
-                new_ctx_out(false);
+                new_ctx_out();
                curr_tensors_size = n_bytes;
            } else {
                curr_tensors_size = next_tensors_size;
--- a/examples/gguf-split/tests.sh
+++ b/examples/gguf-split/tests.sh
@@ -1,89 +0,0 @@
-#!/bin/bash
-
-set -eu
-
-if [ $# -lt 1 ]
-then
-    echo "usage:   $0 path_to_build_binary [path_to_temp_folder]"
-    echo "example: $0 ../../build/bin ../../tmp"
-    exit 1
-fi
-
-if [ $# -gt 1 ]
-then
-    TMP_DIR=$2
-else
-    TMP_DIR=/tmp
-fi
-
-set -x
-
-SPLIT=$1/gguf-split
-MAIN=$1/main
-WORK_PATH=$TMP_DIR/gguf-split
-ROOT_DIR=$(realpath $(dirname $0)/../../)
-
-mkdir -p "$WORK_PATH"
-
-# Clean up in case of previously failed test
-rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
-
-# 1. Get a model
-(
-cd $WORK_PATH
-"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
-)
-echo PASS
-
-# 2. Split with max tensors strategy
-$SPLIT --split-max-tensors 28  $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
-echo PASS
-echo
-
-# 2b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
-echo PASS
-echo
-
-# 3. Merge
-$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-merge.gguf
-echo PASS
-echo
-
-# 3b. Test the merged model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
-echo PASS
-echo
-
-# 4. Split with no tensors in the first split
-$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
-echo PASS
-echo
-
-# 4b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
-echo PASS
-echo
-
-# 5. Merge
-#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
-#echo PASS
-#echo
-
-# 5b. Test the merged model is loading properly
-#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
-#echo PASS
-#echo
-
-# 6. Split with size strategy
-$SPLIT --split-max-size 2G $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-2G
-echo PASS
-echo
-
-# 6b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
-echo PASS
-echo
-
-# Clean up
-rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
--- a/examples/gritlm/README.md
+++ b/examples/gritlm/README.md
@@ -21,12 +21,12 @@ not have to be performed at all.
 ### Running the example
 Download a Grit model:
 ```console
-$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
+$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
 ```

 Run the example using the downloaded model:
 ```console
-$ ./gritlm -m models/gritlm-7b_q4_1.gguf
+$ ./gritlm -m gritlm-7b_q4_1.gguf

 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -19,12 +19,10 @@

 struct Stats {
    std::vector<float> values;
-    std::vector<int> counts;
    int ncall = 0;
 };

 struct StatParams {
-    std::string dataset;
    std::string ofile = "imatrix.dat";
    int         n_output_frequency = 10;
    int         verbosity = 1;
@@ -46,9 +44,9 @@ private:
    std::mutex                             m_mutex;
    int                                    m_last_call = 0;
    std::vector<float>                     m_src1_data;
-    std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
+    std::vector<int>                       m_ids; // the expert ids from ggml_mul_mat_id
                                                  //
-    void save_imatrix(const char * file_name, const char * dataset) const;
+    void save_imatrix(const char * file_name) const;
    void keep_imatrix(int ncall) const;
 };

@@ -83,7 +81,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
    if (ask) {
        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
        if (t->op != GGML_OP_MUL_MAT) return false;
-        // why are small batches ignored (<16 tokens)?
        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
        if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
        return true;
@@ -104,56 +101,43 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
    // this has been adapted to the new format of storing merged experts in a single 3d tensor
    // ref: https://github.com/ggerganov/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
-        //   ids  -> [n_experts_used, n_tokens]
-        //   src1 -> [cols, n_expert_used, n_tokens]
+        const int idx  = ((int32_t *) t->op_params)[0];
        const ggml_tensor * ids = t->src[2];
        const int n_as = src0->ne[2];
-        const int n_ids = ids->ne[0];

        // the top-k selected expert ids are stored in the ids tensor
        // for simplicity, always copy ids to host, because it is small
-        // take into account that ids is not contiguous!
-
-        GGML_ASSERT(ids->ne[1] == src1->ne[2]);
-
-        m_ids.resize(ggml_nbytes(ids));
+        GGML_ASSERT(ids->ne[1] == src1->ne[1]);
+        m_ids.resize(ggml_nbytes(ids)/sizeof(int));
        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));

        auto & e = m_stats[wname];

        ++e.ncall;
+        // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
+        //       using the following line, we can correct for that if needed by replacing the line above with:
+        //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;

-        if (e.values.empty()) {
-            e.values.resize(src1->ne[0]*n_as, 0);
-            e.counts.resize(src1->ne[0]*n_as, 0);
-        }
-        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
-            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
-            exit(1); //GGML_ASSERT(false);
-        }
-        if (m_params.verbosity > 1) {
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
-        }
        // loop over all possible experts, regardless if they are used or not in the batch
        for (int ex = 0; ex < n_as; ++ex) {
            size_t e_start = ex*src1->ne[0];
-
-            for (int idx = 0; idx < n_ids; ++idx) {
-                for (int row = 0; row < (int)src1->ne[2]; ++row) {
-                    const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);
-
-                    GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
-
-                    if (excur != ex) continue;
-
-                    const int64_t i11 = idx % src1->ne[1];
-                    const int64_t i12 = row;
-                    const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
-
-                    for (int j = 0; j < (int)src1->ne[0]; ++j) {
-                        e.values[e_start + j] += x[j]*x[j];
-                        e.counts[e_start + j]++;
-                    }
+            if (e.values.empty()) {
+                e.values.resize(src1->ne[0]*n_as, 0);
+            }
+            else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
+                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+                exit(1); //GGML_ASSERT(false);
+            }
+            if (m_params.verbosity > 1) {
+                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+            }
+            for (int row = 0; row < (int)src1->ne[1]; ++row) {
+                const int excur = m_ids[row*n_as + idx];
+                GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
+                if (excur != ex) continue;
+                const float * x = data + row * src1->ne[0];
+                for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                    e.values[e_start + j] += x[j]*x[j];
                }
            }
            if (e.ncall > m_last_call) {
@@ -170,7 +154,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        auto& e = m_stats[wname];
        if (e.values.empty()) {
            e.values.resize(src1->ne[0], 0);
-            e.counts.resize(src1->ne[0], 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]) {
            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
@@ -184,7 +167,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            const float * x = data + row * src1->ne[0];
            for (int j = 0; j < (int)src1->ne[0]; ++j) {
                e.values[j] += x[j]*x[j];
-                e.counts[j]++;
            }
        }
        if (e.ncall > m_last_call) {
@@ -202,7 +184,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 }

 void IMatrixCollector::save_imatrix() const {
-    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
+    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
 }

 void IMatrixCollector::keep_imatrix(int ncall) const {
@@ -210,39 +192,24 @@ void IMatrixCollector::keep_imatrix(int ncall) const {
    if (file_name.empty()) file_name = "imatrix.dat";
    file_name += ".at_";
    file_name += std::to_string(ncall);
-    save_imatrix(file_name.c_str(), m_params.dataset.c_str());
+    save_imatrix(file_name.c_str());
 }

-void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
+void IMatrixCollector::save_imatrix(const char * fname) const {
    std::ofstream out(fname, std::ios::binary);
    int n_entries = m_stats.size();
-    out.write((const char *) &n_entries, sizeof(n_entries));
-    for (const auto & p : m_stats) {
+    out.write((const char*)&n_entries, sizeof(n_entries));
+    for (auto& p : m_stats) {
        int len = p.first.size();
-        out.write((const char *) &len, sizeof(len));
+        out.write((const char*)&len, sizeof(len));
        out.write(p.first.c_str(), len);
-        out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
+        out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
        int nval = p.second.values.size();
-        out.write((const char *) &nval, sizeof(nval));
-        if (nval > 0) {
-            std::vector<float> tmp(nval);
-            for (int i = 0; i < nval; i++) {
-                tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
-            }
-            out.write((const char*)tmp.data(), nval*sizeof(float));
-        }
+        out.write((const char*)&nval, sizeof(nval));
+        if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
    }
-
-    // Write the number of call the matrix was computed with
-    out.write((const char *) &m_last_call, sizeof(m_last_call));
-
-    // Write the dataset name at the end of the file to later on specify it in quantize
-    int n_dataset = strlen(dataset);
-    out.write((const char *) &n_dataset, sizeof(n_dataset));
-    out.write(dataset, n_dataset);
-
    if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
    }
 }

@@ -278,28 +245,14 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
            imatrix_data = {};
            return false;
        }
-
-        // When re-called from load_imatrix() with add set, this will already be created.
-        if (e.values.empty()) {
-            e.values.resize(nval, 0);
-            e.counts.resize(nval, 0);
-        }
-
-        std::vector<float> tmp(nval);
-        in.read((char*)tmp.data(), nval*sizeof(float));
+        e.values.resize(nval);
+        in.read((char*)e.values.data(), nval*sizeof(float));
        if (in.fail()) {
            printf("%s: failed reading data for entry %d\n",__func__,i);
            imatrix_data = {};
            return false;
        }
-
-        // Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
-        for (int i = 0; i < nval; i++) {
-            e.values[i] += tmp[i];
-            e.counts[i] += ncall;
-        }
-        e.ncall += ncall;
-
+        e.ncall = ncall;
    }
    return true;
 }
@@ -579,29 +532,6 @@ int main(int argc, char ** argv) {
        }
    }

-    gpt_params params;
-    params.n_batch = 512;
-    if (!gpt_params_parse(args.size(), args.data(), params)) {
-        return 1;
-    }
-
-    params.logits_all = true;
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
-
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    sparams.dataset = params.prompt_file;
    g_collector.set_parameters(std::move(sparams));

    if (!combine_files.empty()) {
@@ -640,6 +570,28 @@ int main(int argc, char ** argv) {
        }
    }

+    gpt_params params;
+    params.n_batch = 512;
+    if (!gpt_params_parse(args.size(), args.data(), params)) {
+        return 1;
+    }
+
+    params.logits_all = true;
+    params.n_batch = std::min(params.n_batch, params.n_ctx);
+
+    print_build_info();
+
+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
    llama_backend_init();
    llama_numa_init(params.numa);

--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@@ -36,11 +36,6 @@ The `infill` program offers a seamless way to interact with LLaMA models, allowi

 ### Example

-Download a model that supports infill, for example CodeLlama:
-```console
-scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
-```
-
 ```bash
 ./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n    print(\"hell" --in-suffix "\n   print(\"goodbye world\")\n    "
 ```
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -586,7 +586,7 @@ int main(int argc, char ** argv) {

            // deal with eot token in infill mode
            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
-                if (is_interacting && !params.interactive_first) {
+                if(is_interacting && !params.interactive_first) {
                    // print an eot token
                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
                }
@@ -651,8 +651,8 @@ int main(int argc, char ** argv) {
                // LOG_TEE("took new input\n");
                is_interacting = false;
            }
-            // deal with end of generation tokens in interactive mode
-            else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
+            // deal with end of text token in interactive mode
+            else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
                LOG("found EOS token\n");

                if (params.interactive) {
@@ -731,8 +731,8 @@ int main(int argc, char ** argv) {
            }
        }

-        // end of generation
-        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
+        // end of text token
+        if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
            break;
        }

--- a/examples/json-schema-to-grammar.py
+++ b/examples/json-schema-to-grammar.py
@@ -6,94 +6,37 @@ import re
 import sys
 from typing import Any, Dict, List, Set, Tuple, Union

-def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
-    if not separator_rule:
-        if min_items == 0 and max_items == 1:
-            return f'{item_rule}?'
-        elif min_items == 1 and max_items is None:
-            return f'{item_rule}+'
-
-    result = ''
-
-    if min_items > 0:
-        if item_rule_is_literal and separator_rule is None:
-            result = '"' + (item_rule[1:-1] * min_items) + '"'
-        else:
-            result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
-
-    def opt_repetitions(up_to_n, prefix_with_sep=False):
-        '''
-            - n=4, no sep:             '(a (a (a (a)?)?)?)?'
-            - n=4, sep=',', prefix:    '("," a ("," a ("," a ("," a)?)?)?)?'
-            - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
-        '''
-
-        content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
-        if up_to_n == 0:
-            return ''
-        elif up_to_n == 1:
-            return f'({content})?'
-        elif separator_rule and not prefix_with_sep:
-            return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
-        else:
-            return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
-
-    if min_items > 0 and max_items != min_items:
-        result += ' '
-
-    if max_items is not None:
-        result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
-    else:
-        item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
-
-        if min_items == 0 and separator_rule:
-            result = f'({item_rule} {item_operator}*)?'
-        else:
-            result += f'{item_operator}*'
-
-    return result
-
-
-class BuiltinRule:
-    def __init__(self, content: str, deps: list = None):
-        self.content = content
-        self.deps = deps or []
-
-_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
-
 # whitespace is constrained to a single space char to prevent model "running away" in
 # whitespace. Also maybe improves generation quality?
 SPACE_RULE = '" "?'

 PRIMITIVE_RULES = {
-    'boolean'      : BuiltinRule('("true" | "false") space', []),
-    'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
-    'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
-    'number'       : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
-    'integer'      : BuiltinRule('("-"? integral-part) space', ['integral-part']),
-    'value'        : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
-    'object'       : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
-    'array'        : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
-    'uuid'         : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
-    'char'         : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
-    'string'       : BuiltinRule(r'"\"" char* "\"" space', ['char']),
-    'null'         : BuiltinRule('"null" space', []),
+    'boolean': '("true" | "false") space',
+    'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
+    'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
+    'value'  : 'object | array | string | number | boolean',
+    'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
+    'array'  : '"[" space ( value ("," space value)* )? "]" space',
+    'uuid'   : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space',
+    'string': r''' "\"" (
+        [^"\\] |
+        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+      )* "\"" space''',
+    'null': '"null" space',
 }
+OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value']

 # TODO: support "uri", "email" string formats
-STRING_FORMAT_RULES = {
-    'date'            : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
-    'time'            : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
-    'date-time'       : BuiltinRule('date "T" time', ['date', 'time']),
-    'date-string'     : BuiltinRule('"\\"" date "\\"" space', ['date']),
-    'time-string'     : BuiltinRule('"\\"" time "\\"" space', ['time']),
-    'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
+DATE_RULES = {
+    'date'   : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
+    'time'   : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
+    'date-time': 'date "T" time',
+    'date-string': '"\\"" date "\\"" space',
+    'time-string': '"\\"" time "\\"" space',
+    'date-time-string': '"\\"" date-time "\\"" space',
 }

-DOTALL = '[\\U00000000-\\U0010FFFF]'
-DOT = '[^\\x0A\\x0D]'
-
-RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])
+RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()])

 INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
 GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
@@ -103,6 +46,8 @@ GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']'
 NON_LITERAL_SET = set('|.()[]{}*+?')
 ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')

+DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
+TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits

 class SchemaConverter:
    def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
@@ -110,9 +55,7 @@ class SchemaConverter:
        self._allow_fetch = allow_fetch
        self._dotall = dotall
        self._raw_pattern = raw_pattern
-        self._rules = {
-            'space': SPACE_RULE,
-        }
+        self._rules = {'space': SPACE_RULE}
        self._refs = {}
        self._refs_being_resolved = set()

@@ -122,29 +65,6 @@ class SchemaConverter:
        )
        return f'"{escaped}"'

-    def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str:
-        '''
-            not_literal('a') -> '[^a]'
-            not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?'
-        '''
-        assert len(literal) > 0, 'Empty literal not supported'
-        def recurse(i: int):
-            c = literal[i]
-            if maybe_escaped_underscores and c == '_':
-                yield f'[^{c}\\\\]'
-                yield ' | '
-                yield f'"\\\\"? "{c}"'
-            else:
-                yield f'[^{c}]'
-            if i < len(literal) - 1:
-                yield ' | '
-                yield self._format_literal(c)
-                yield ' ('
-                yield from recurse(i + 1)
-                yield ')?'
-
-        return ''.join(('(', *recurse(0), ')'))
-
    def _add_rule(self, name, rule):
        esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
        if esc_name not in self._rules or self._rules[esc_name] == rule:
@@ -249,10 +169,10 @@ class SchemaConverter:

            def get_dot():
                if self._dotall:
-                    rule = DOTALL
+                    rule = '[\\U00000000-\\U0010FFFF]'
                else:
                    # Accept any character... except \n and \r line break chars (\x0A and \xOD)
-                    rule = DOT
+                    rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]'
                return self._add_rule(f'dot', rule)

            def join_seq():
@@ -326,14 +246,26 @@ class SchemaConverter:

                    (sub, sub_is_literal) = seq[-1]

-                    if not sub_is_literal:
-                        id = sub_rule_ids.get(sub)
-                        if id is None:
-                            id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
-                            sub_rule_ids[sub] = id
-                        sub = id
+                    if min_times == 0 and max_times is None:
+                        seq[-1] = (f'{sub}*', False)
+                    elif min_times == 0 and max_times == 1:
+                        seq[-1] = (f'{sub}?', False)
+                    elif min_times == 1 and max_times is None:
+                        seq[-1] = (f'{sub}+', False)
+                    else:
+                        if not sub_is_literal:
+                            id = sub_rule_ids.get(sub)
+                            if id is None:
+                                id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
+                                sub_rule_ids[sub] = id
+                            sub = id

-                    seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
+                        seq[-1] = (
+                            ' '.join(
+                                ([f'"{sub[1:-1] * min_times}"'] if sub_is_literal else [sub] * min_times) +
+                                ([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])),
+                            False
+                        )
                else:
                    literal = ''
                    while i < length:
@@ -441,47 +373,49 @@ class SchemaConverter:
                    ' "]" space')
            else:
                item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
+                list_item_operator = f'( "," space {item_rule_name} )'
+                successive_items = ""
                min_items = schema.get("minItems", 0)
                max_items = schema.get("maxItems")
-                return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')
+                if min_items > 0:
+                    successive_items = list_item_operator * (min_items - 1)
+                    min_items -= 1
+                if max_items is not None and max_items > min_items:
+                    successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
+                else:
+                    successive_items += list_item_operator + "*"
+                if min_items == 0:
+                    rule = f'"[" space ( {item_rule_name} {successive_items} )? "]" space'
+                else:
+                    rule = f'"[" space {item_rule_name} {successive_items} "]" space'
+                return self._add_rule(rule_name, rule)

        elif schema_type in (None, 'string') and 'pattern' in schema:
            return self._visit_pattern(schema['pattern'], rule_name)

        elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
-            return self._add_primitive(
+            return self._add_rule(
                'root' if rule_name == 'root' else schema_format,
                PRIMITIVE_RULES['uuid']
            )

-        elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES:
-            prim_name = f'{schema_format}-string'
-            return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]))
-
-        elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema):
-            char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
-            min_len = schema.get('minLength', 0)
-            max_len = schema.get('maxLength')
-
-            return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
+        elif schema_type in (None, 'string') and schema_format in DATE_RULES:
+            for t, r in DATE_RULES.items():
+                self._add_rule(t, r)
+            return schema_format + '-string'

        elif (schema_type == 'object') or (len(schema) == 0):
-            return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
+            for n in OBJECT_RULE_NAMES:
+                self._add_rule(n, PRIMITIVE_RULES[n])
+            return self._add_rule(rule_name, 'object')

        else:
            assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
            # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
-            return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type])
-
-    def _add_primitive(self, name: str, rule: BuiltinRule):
-        n = self._add_rule(name, rule.content)
-
-        for dep in rule.deps:
-            dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep)
-            assert dep_rule, f'Rule {dep} not known'
-            if dep not in self._rules:
-                self._add_primitive(dep, dep_rule)
-        return n
+            return self._add_rule(
+                'root' if rule_name == 'root' else schema_type,
+                PRIMITIVE_RULES[schema_type]
+            )

    def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
        prop_order = self._prop_order
@@ -503,7 +437,7 @@ class SchemaConverter:
            value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
            prop_kv_rule_names["*"] = self._add_rule(
                f'{sub_name}-kv',
-                self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
+                self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
            )
            optional_props.append("*")

--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -26,21 +26,16 @@ options:
  -m, --model <filename>              (default: models/7B/ggml-model-q4_0.gguf)
  -p, --n-prompt <n>                  (default: 512)
  -n, --n-gen <n>                     (default: 128)
-  -pg <pp,tg>                         (default: 512,128)
-  -b, --batch-size <n>                (default: 2048)
-  -ub, --ubatch-size <n>              (default: 512)
-  -ctk, --cache-type-k <t>            (default: f16)
-  -ctv, --cache-type-v <t>            (default: f16)
-  -t, --threads <n>                   (default: 16)
+  -b, --batch-size <n>                (default: 512)
+  -ctk <t>, --cache-type-k <t>        (default: f16)
+  -ctv <t>, --cache-type-v <t>        (default: f16)
+  -t, --threads <n>                   (default: 112)
  -ngl, --n-gpu-layers <n>            (default: 99)
  -sm, --split-mode <none|layer|row>  (default: layer)
  -mg, --main-gpu <i>                 (default: 0)
  -nkvo, --no-kv-offload <0|1>        (default: 0)
-  -fa, --flash-attn <0|1>             (default: 0)
  -mmp, --mmap <0|1>                  (default: 1)
-  --numa <distribute|isolate|numactl> (default: disabled)
-  -embd, --embeddings <0|1>           (default: 0)
-  -ts, --tensor-split <ts0/ts1/..>    (default: 0)
+  -ts, --tensor_split <ts0/ts1/..>    (default: 0)
  -r, --repetitions <n>               (default: 5)
  -o, --output <csv|json|md|sql>      (default: md)
  -v, --verbose                       (default: 0)
@@ -48,11 +43,10 @@ options:
 Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
 ```

-llama-bench can perform three types of tests:
+llama-bench can perform two types of tests:

 - Prompt processing (pp): processing a prompt in batches (`-p`)
 - Text generation (tg): generating a sequence of tokens (`-n`)
- Prompt processing + text generation (pg): processing a prompt followed by generating a sequence of tokens (`-pg`)

 With the exception of `-r`, `-o` and `-v`, all options can be specified multiple times to run multiple tests. Each pp and tg test is run with all combinations of the specified options. To specify multiple values for an option, the values can be separated by commas (e.g. `-n 16,32`), or the option can be specified multiple times (e.g. `-n 16 -n 32`).

--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -161,17 +161,10 @@ static const char * split_mode_str(llama_split_mode mode) {
    }
 }

-static std::string pair_str(const std::pair<int, int> & p) {
-    static char buf[32];
-    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
-    return buf;
-}
-
 struct cmd_params {
    std::vector<std::string> model;
    std::vector<int> n_prompt;
    std::vector<int> n_gen;
-    std::vector<std::pair<int, int>> n_pg;
    std::vector<int> n_batch;
    std::vector<int> n_ubatch;
    std::vector<ggml_type> type_k;
@@ -181,11 +174,9 @@ struct cmd_params {
    std::vector<llama_split_mode> split_mode;
    std::vector<int> main_gpu;
    std::vector<bool> no_kv_offload;
-    std::vector<bool> flash_attn;
    std::vector<std::vector<float>> tensor_split;
    std::vector<bool> use_mmap;
    std::vector<bool> embeddings;
-    ggml_numa_strategy numa;
    int reps;
    bool verbose;
    output_formats output_format;
@@ -195,21 +186,18 @@ static const cmd_params cmd_params_defaults = {
    /* model         */ {"models/7B/ggml-model-q4_0.gguf"},
    /* n_prompt      */ {512},
    /* n_gen         */ {128},
-    /* n_pg          */ {{512, 128}},
    /* n_batch       */ {2048},
    /* n_ubatch      */ {512},
    /* type_k        */ {GGML_TYPE_F16},
    /* type_v        */ {GGML_TYPE_F16},
-    /* n_threads     */ {get_math_cpu_count()},
+    /* n_threads     */ {get_num_physical_cores()},
    /* n_gpu_layers  */ {99},
    /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
    /* main_gpu      */ {0},
    /* no_kv_offload */ {false},
-    /* flash_attn    */ {false},
    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
    /* use_mmap      */ {true},
    /* embeddings    */ {false},
-    /* numa          */ GGML_NUMA_STRATEGY_DISABLED,
    /* reps          */ 5,
    /* verbose       */ false,
    /* output_format */ MARKDOWN
@@ -223,19 +211,16 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -m, --model <filename>              (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
    printf("  -p, --n-prompt <n>                  (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
    printf("  -n, --n-gen <n>                     (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
-    printf("  -pg <pp,tg>                         (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
    printf("  -b, --batch-size <n>                (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  -ub, --ubatch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
-    printf("  -ctk, --cache-type-k <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf("  -ctv, --cache-type-v <t>            (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf("  -ub N, --ubatch-size <n>            (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf("  -ctk <t>, --cache-type-k <t>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf("  -ctv <t>, --cache-type-v <t>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
    printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
    printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
    printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
    printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
    printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
-    printf("  -fa, --flash-attn <0|1>             (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
    printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
-    printf("  --numa <distribute|isolate|numactl> (default: disabled)\n");
    printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
    printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
    printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
@@ -313,17 +298,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
-        } else if (arg == "-pg") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = split<std::string>(argv[i], ',');
-            if (p.size() != 2) {
-                invalid_param = true;
-                break;
-            }
-            params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;
@@ -419,24 +393,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<bool>(argv[i], split_delim);
            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
-        } else if (arg == "--numa") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            } else {
-                std::string value(argv[i]);
-                /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
-                else if (value == "isolate")                    { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
-                else if (value == "numactl")                    { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
-                else { invalid_param = true; break; }
-            }
-        } else if (arg == "-fa" || arg == "--flash-attn") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = split<bool>(argv[i], split_delim);
-            params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
        } else if (arg == "-mmp" || arg == "--mmap") {
            if (++i >= argc) {
                invalid_param = true;
@@ -513,7 +469,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.model.empty())        { params.model = cmd_params_defaults.model; }
    if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
    if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
-    if (params.n_pg.empty())         { params.n_pg = cmd_params_defaults.n_pg; }
    if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
    if (params.n_ubatch.empty())     { params.n_ubatch = cmd_params_defaults.n_ubatch; }
    if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
@@ -522,7 +477,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
-    if (params.flash_attn.empty())   { params.flash_attn = cmd_params_defaults.flash_attn; }
    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
    if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
    if (params.embeddings.empty())   { params.embeddings = cmd_params_defaults.embeddings; }
@@ -544,7 +498,6 @@ struct cmd_params_instance {
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
-    bool flash_attn;
    std::vector<float> tensor_split;
    bool use_mmap;
    bool embeddings;
@@ -579,7 +532,6 @@ struct cmd_params_instance {
        cparams.type_k = type_k;
        cparams.type_v = type_v;
        cparams.offload_kqv = !no_kv_offload;
-        cparams.flash_attn = flash_attn;
        cparams.embeddings = embeddings;

        return cparams;
@@ -602,7 +554,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & tk : params.type_k)
    for (const auto & tv : params.type_v)
    for (const auto & nkvo : params.no_kv_offload)
-    for (const auto & fa : params.flash_attn)
    for (const auto & nt : params.n_threads) {
        for (const auto & n_prompt : params.n_prompt) {
            if (n_prompt == 0) {
@@ -621,7 +572,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
-                /* .flash_attn   = */ fa,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
                /* .embeddings   = */ embd,
@@ -646,32 +596,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
-                /* .flash_attn   = */ fa,
-                /* .tensor_split = */ ts,
-                /* .use_mmap     = */ mmp,
-                /* .embeddings   = */ embd,
-            };
-            instances.push_back(instance);
-        }
-
-        for (const auto & n_pg : params.n_pg) {
-            if (n_pg.first == 0 && n_pg.second == 0) {
-                continue;
-            }
-            cmd_params_instance instance = {
-                /* .model        = */ m,
-                /* .n_prompt     = */ n_pg.first,
-                /* .n_gen        = */ n_pg.second,
-                /* .n_batch      = */ nb,
-                /* .n_ubatch     = */ nub,
-                /* .type_k       = */ tk,
-                /* .type_v       = */ tv,
-                /* .n_threads    = */ nt,
-                /* .n_gpu_layers = */ nl,
-                /* .split_mode   = */ sm,
-                /* .main_gpu     = */ mg,
-                /* .no_kv_offload= */ nkvo,
-                /* .flash_attn   = */ fa,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
                /* .embeddings   = */ embd,
@@ -709,7 +633,6 @@ struct test {
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
-    bool flash_attn;
    std::vector<float> tensor_split;
    bool use_mmap;
    bool embeddings;
@@ -734,7 +657,6 @@ struct test {
        split_mode = inst.split_mode;
        main_gpu = inst.main_gpu;
        no_kv_offload = inst.no_kv_offload;
-        flash_attn = inst.flash_attn;
        tensor_split = inst.tensor_split;
        use_mmap = inst.use_mmap;
        embeddings = inst.embeddings;
@@ -809,7 +731,7 @@ struct test {
            "n_batch", "n_ubatch",
            "n_threads", "type_k", "type_v",
            "n_gpu_layers", "split_mode",
-            "main_gpu", "no_kv_offload", "flash_attn",
+            "main_gpu", "no_kv_offload",
            "tensor_split", "use_mmap", "embeddings",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
@@ -831,7 +753,7 @@ struct test {
        }
        if (field == "cuda" || field == "opencl"  || field == "vulkan" || field == "kompute" || field == "metal" ||
            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
+            field == "use_mmap" || field == "embeddings") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@@ -865,7 +787,7 @@ struct test {
            std::to_string(n_batch), std::to_string(n_ubatch),
            std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
            std::to_string(n_gpu_layers), split_mode_str(split_mode),
-            std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
+            std::to_string(main_gpu), std::to_string(no_kv_offload),
            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -1011,9 +933,6 @@ struct markdown_printer : public printer {
        if (field == "n_gpu_layers") {
            return 3;
        }
-        if (field == "test") {
-            return 13;
-        }

        int width = std::max((int)field.length(), 10);

@@ -1036,9 +955,6 @@ struct markdown_printer : public printer {
        if (field == "no_kv_offload") {
            return "nkvo";
        }
-        if (field == "flash_attn") {
-            return "fa";
-        }
        if (field == "use_mmap") {
            return "mmap";
        }
@@ -1085,9 +1001,6 @@ struct markdown_printer : public printer {
        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
            fields.emplace_back("no_kv_offload");
        }
-        if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
-            fields.emplace_back("flash_attn");
-        }
        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
            fields.emplace_back("tensor_split");
        }
@@ -1140,11 +1053,12 @@ struct markdown_printer : public printer {
                value = test::get_backend();
            } else if (field == "test") {
                if (t.n_prompt > 0 && t.n_gen == 0) {
-                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
+                    snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
                } else if (t.n_gen > 0 && t.n_prompt == 0) {
-                    snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
+                    snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
                } else {
-                    snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
+                    assert(false);
+                    exit(1);
                }
                value = buf;
            } else if (field == "t/s") {
@@ -1277,7 +1191,6 @@ int main(int argc, char ** argv) {
        llama_log_set(llama_null_log_callback, NULL);
    }
    llama_backend_init();
-    llama_numa_init(params.numa);

    // initialize printer
    std::unique_ptr<printer> p;
@@ -1345,7 +1258,6 @@ int main(int argc, char ** argv) {
            llama_kv_cache_clear(ctx);

            uint64_t t_start = get_time_ns();
-
            if (t.n_prompt > 0) {
                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
            }
--- a/examples/llama.android/app/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/app/src/main/cpp/CMakeLists.txt
@@ -12,17 +12,15 @@ cmake_minimum_required(VERSION 3.22.1)
 # build script scope).
 project("llama-android")

-#include(FetchContent)
-#FetchContent_Declare(
-#        llama
-#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-#        GIT_TAG        ci-android
-#)
-#
-## Also provides "common"
-#FetchContent_MakeAvailable(llama)
+include(FetchContent)
+FetchContent_Declare(
+        llama
+        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+        GIT_TAG        master
+)

-add_subdirectory(../../../../../../ please-work)
+# Also provides "common"
+FetchContent_MakeAvailable(llama)

 # Creates and names a library, sets it as either STATIC
 # or SHARED, and provides the relative paths to its source code.
--- a/examples/llama.android/app/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp
@@ -408,7 +408,7 @@ Java_com_example_llama_Llm_completion_1loop(
    const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);

    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
-    if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+    if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
        return env->NewStringUTF("");
    }

--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -158,7 +158,7 @@ actor LlamaContext {
            new_token_id = llama_sample_token_greedy(context, &candidates_p)
        }

-        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
+        if new_token_id == llama_token_eos(model) || n_cur == n_len {
            print("\n")
            let new_token_str = String(cString: temporary_invalid_cchars + [0])
            temporary_invalid_cchars.removeAll()
@@ -322,7 +322,7 @@ actor LlamaContext {
        defer {
            result.deallocate()
        }
-        let nTokens = llama_token_to_piece(model, token, result, 8, false)
+        let nTokens = llama_token_to_piece(model, token, result, 8)

        if nTokens < 0 {
            let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@@ -330,7 +330,7 @@ actor LlamaContext {
            defer {
                newResult.deallocate()
            }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
            return Array(bufferPointer)
        } else {
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -56,7 +56,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa
 python ./convert.py ../llava-v1.5-7b --skip-unknown
 ```

-Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
+Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.

 ## LLaVA 1.6 gguf conversion
 1) First clone a LLaVA 1.6 model:
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -3,7 +3,6 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
-#include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -24,6 +23,7 @@
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
+#include <iostream>
 #include <map>
 #include <regex>
 #include <stdexcept>
@@ -104,7 +104,6 @@ static std::string format(const char * fmt, ...) {
 #define TN_POS_EMBD        "%s.position_embd.weight"
 #define TN_CLASS_EMBD      "v.class_embd"
 #define TN_PATCH_EMBD      "v.patch_embd.weight"
-#define TN_PATCH_BIAS      "v.patch_embd.bias"
 #define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
@@ -146,7 +145,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
 static int get_key_idx(const gguf_context * ctx, const char * key) {
    int i = gguf_find_key(ctx, key);
    if (i == -1) {
-        LOG_TEE("key %s not found in file\n", key);
+        fprintf(stderr, "key %s not found in file\n", key);
        throw std::runtime_error(format("Missing required key: %s", key));
    }

@@ -248,7 +247,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {

 static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
    size_t tensor_size = ggml_nbytes(tensor);
-    LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+    printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
            prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
 }
@@ -266,7 +265,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
 static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
-        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
+        std::cerr << "Failed to open file for writing: " << filename << std::endl;
        return;
    }

@@ -285,7 +284,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
 static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
-        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
+        std::cerr << "Failed to open file for writing: " << filename << std::endl;
        return;
    }

@@ -426,7 +425,6 @@ struct clip_vision_model {
    // embeddings
    struct ggml_tensor * class_embedding;
    struct ggml_tensor * patch_embeddings;
-    struct ggml_tensor * patch_bias;
    struct ggml_tensor * position_embeddings;

    struct ggml_tensor * pre_ln_w;
@@ -503,11 +501,6 @@ struct clip_ctx {
    bool use_gelu = false;
    int32_t ftype = 1;

-    bool has_class_embedding = true;
-    bool has_pre_norm = true;
-    bool has_post_norm = false;
-    bool has_patch_bias = false;
-
    struct gguf_context * ctx_gguf;
    struct ggml_context * ctx_data;

@@ -522,7 +515,7 @@ struct clip_ctx {

 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
    if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        printf("This gguf file seems to have no vision encoder\n");
        return nullptr;
    }

@@ -533,7 +526,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    const int patch_size           = hparams.patch_size;
    const int num_patches          = ((image_size / patch_size) * (image_size / patch_size));
    const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
-    const int num_positions        = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_positions        = num_patches + 1;
    const int hidden_size          = hparams.hidden_size;
    const int n_head               = hparams.n_head;
    const int d_head               = hidden_size / n_head;
@@ -564,23 +557,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

-    if (ctx->has_patch_bias) {
-        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
-        inp = ggml_add(ctx0, inp, model.patch_bias);
-    }
-
    // concat class_embeddings and patch_embeddings
-    struct ggml_tensor * embeddings = inp;
-    if (ctx->has_class_embedding) {
-        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-        ggml_set_name(embeddings, "embeddings");
-        ggml_set_input(embeddings);
-        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-        embeddings = ggml_acc(ctx0, embeddings, inp,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-    }
+    struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+    ggml_set_name(embeddings, "embeddings");
+    ggml_set_input(embeddings);

+    embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+
+    embeddings = ggml_acc(ctx0, embeddings, inp,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);

    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
    ggml_set_name(positions, "positions");
@@ -590,7 +576,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

    // pre-layernorm
-    if (ctx->has_pre_norm) {
+    {
        embeddings = ggml_norm(ctx0, embeddings, eps);
        ggml_set_name(embeddings, "pre_ln");

@@ -678,14 +664,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        embeddings = cur;
    }

-    // post-layernorm
-    if (ctx->has_post_norm) {
-        embeddings = ggml_norm(ctx0, embeddings, eps);
-        ggml_set_name(embeddings, "post_ln");
-
-        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
-    }
-
    // llava projector
    {
        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -901,21 +879,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        const int idx_name = gguf_find_key(ctx, KEY_NAME);
        if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
            const std::string name = gguf_get_val_str(ctx, idx_name);
-            LOG_TEE("%s: model name:   %s\n", __func__, name.c_str());
+            printf("%s: model name:   %s\n", __func__, name.c_str());
        }
-        LOG_TEE("%s: description:  %s\n", __func__, description.c_str());
-        LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
-        LOG_TEE("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
-        LOG_TEE("%s: n_tensors:    %d\n", __func__, n_tensors);
-        LOG_TEE("%s: n_kv:         %d\n", __func__, n_kv);
-        LOG_TEE("%s: ftype:        %s\n", __func__, ftype_str.c_str());
-        LOG_TEE("\n");
+        printf("%s: description:  %s\n", __func__, description.c_str());
+        printf("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
+        printf("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
+        printf("%s: n_tensors:    %d\n", __func__, n_tensors);
+        printf("%s: n_kv:         %d\n", __func__, n_kv);
+        printf("%s: ftype:        %s\n", __func__, ftype_str.c_str());
+        printf("\n");
    }
    const int n_tensors = gguf_get_n_tensors(ctx);

    // kv
    const int n_kv = gguf_get_n_kv(ctx);
-    LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+    printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
        __func__, n_kv, n_tensors, fname);
    {
        std::map<enum ggml_type, uint32_t> n_type;
@@ -926,7 +904,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            n_type[type]++;
        }

-        LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
        for (int i = 0; i < n_kv; i++) {
            const char * name           = gguf_get_key(ctx, i);
            const enum gguf_type type   = gguf_get_kv_type(ctx, i);
@@ -942,7 +920,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            }
            replace_all(value, "\n", "\\n");

-            LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+            printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
        }

        // print type counts
@@ -951,7 +929,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                continue;
            }

-            LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+            printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
        }
    }

@@ -966,7 +944,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            size_t tensor_size = ggml_nbytes(cur);
            model_size += tensor_size;
            if (verbosity >= 3) {
-                LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
                       __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
            }
        }
@@ -993,18 +971,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

 #ifdef GGML_USE_CUDA
    new_clip->backend = ggml_backend_cuda_init(0);
-    LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
+    printf("%s: CLIP using CUDA backend\n", __func__);
 #endif

 #ifdef GGML_USE_METAL
    new_clip->backend = ggml_backend_metal_init();
-    LOG_TEE("%s: CLIP using Metal backend\n", __func__);
+    printf("%s: CLIP using Metal backend\n", __func__);
 #endif


    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
-        LOG_TEE("%s: CLIP using CPU backend\n", __func__);
+        printf("%s: CLIP using CPU backend\n", __func__);
    }

    // model size and capabilities
@@ -1028,15 +1006,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        new_clip->use_gelu = gguf_get_val_bool(ctx, idx);

        if (verbosity >= 1) {
-            LOG_TEE("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
-            LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
-            LOG_TEE("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
-            LOG_TEE("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
-            LOG_TEE("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+            printf("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
+            printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
+            printf("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
+            printf("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
+            printf("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
        }
    }

-    LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
+    printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);

    // load tensors
    {
@@ -1049,7 +1027,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

        new_clip->ctx_data = ggml_init(params);
        if (!new_clip->ctx_data) {
-            LOG_TEE("%s: ggml_init() failed\n", __func__);
+            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
            clip_free(new_clip);
            gguf_free(ctx);
            return nullptr;
@@ -1057,7 +1035,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

        auto fin = std::ifstream(fname, std::ios::binary);
        if (!fin) {
-            LOG_TEE("cannot open model file for loading tensors\n");
+            printf("cannot open model file for loading tensors\n");
            clip_free(new_clip);
            gguf_free(ctx);
            return nullptr;
@@ -1079,7 +1057,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
            fin.seekg(offset, std::ios::beg);
            if (!fin) {
-                LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
+                printf("%s: failed to seek for tensor %s\n", __func__, name);
                clip_free(new_clip);
                gguf_free(ctx);
                return nullptr;
@@ -1150,61 +1128,34 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }

        if (verbosity >= 2) {
-            LOG_TEE("\n%s: vision model hparams\n", __func__);
-            LOG_TEE("image_size         %d\n", hparams.image_size);
-            LOG_TEE("patch_size         %d\n", hparams.patch_size);
-            LOG_TEE("v_hidden_size      %d\n", hparams.hidden_size);
-            LOG_TEE("v_n_intermediate   %d\n", hparams.n_intermediate);
-            LOG_TEE("v_projection_dim   %d\n", hparams.projection_dim);
-            LOG_TEE("v_n_head           %d\n", hparams.n_head);
-            LOG_TEE("v_n_layer          %d\n", hparams.n_layer);
-            LOG_TEE("v_eps              %f\n", hparams.eps);
-            LOG_TEE("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
-            LOG_TEE("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
-            LOG_TEE("v_image_grid_pinpoints: ");
+            printf("\n%s: vision model hparams\n", __func__);
+            printf("image_size         %d\n", hparams.image_size);
+            printf("patch_size         %d\n", hparams.patch_size);
+            printf("v_hidden_size      %d\n", hparams.hidden_size);
+            printf("v_n_intermediate   %d\n", hparams.n_intermediate);
+            printf("v_projection_dim   %d\n", hparams.projection_dim);
+            printf("v_n_head           %d\n", hparams.n_head);
+            printf("v_n_layer          %d\n", hparams.n_layer);
+            printf("v_eps              %f\n", hparams.eps);
+            printf("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+            printf("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+            printf("v_image_grid_pinpoints: ");
            for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
-                LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
+                printf("%d ", hparams.image_grid_pinpoints[i]);
            }
-            LOG_TEE("\n");
-            LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+            printf("\n");
+            printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);

        }

-        try {
-            vision_model.class_embedding  = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
-            new_clip->has_class_embedding = true;
-        } catch (const std::exception& e) {
-            new_clip->has_class_embedding = false;
-        }
-
-        try {
-            vision_model.pre_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-            vision_model.pre_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-            new_clip->has_pre_norm = true;
-        } catch (std::exception & e) {
-            new_clip->has_pre_norm = false;
-        }
-
-        try {
-            vision_model.post_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
-            vision_model.post_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
-            new_clip->has_post_norm = true;
-        } catch (std::exception & e) {
-            new_clip->has_post_norm = false;
-        }
-
-        try {
-            vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
-            new_clip->has_patch_bias = true;
-        } catch (std::exception & e) {
-            new_clip->has_patch_bias = false;
-        }
-
        try {
            vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+            vision_model.class_embedding     = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+            vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+            vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
        } catch(const std::exception& e) {
-            LOG_TEE("%s: failed to load vision model tensors\n", __func__);
+            fprintf(stderr, "%s: failed to load vision model tensors\n", __func__);
        }

        // LLaVA projection
@@ -1233,7 +1184,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            } catch (std::runtime_error & e) {  }
            try {
                vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
-                // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
+                // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__);
            } catch (std::runtime_error & e) {  }
        } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projection
@@ -1313,7 +1264,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+        printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
    }

    return new_clip;
@@ -1353,7 +1304,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
    int nx, ny, nc;
    auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
    if (!data) {
-        LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
+        fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
        return false;
    }
    build_clip_img_from_data(data, nx, ny, img);
@@ -1365,7 +1316,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
    int nx, ny, nc;
    auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
    if (!data) {
-        LOG_TEE("%s: failed to decode image bytes\n", __func__);
+        fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
        return false;
    }
    build_clip_img_from_data(data, nx, ny, img);
@@ -1374,7 +1325,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
 }

 // Linear interpolation between two points
-inline float clip_lerp(float s, float e, float t) {
+inline float lerp(float s, float e, float t) {
    return s + (e - s) * t;
 }
 // Bilinear resize function
@@ -1396,17 +1347,17 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
            float y_lerp = py - y_floor;

            for (int c = 0; c < 3; c++) {
-                float top = clip_lerp(
+                float top = lerp(
                    static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
                    static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
                    x_lerp
                );
-                float bottom = clip_lerp(
+                float bottom = lerp(
                    static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
                    static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
                    x_lerp
                );
-                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp));
+                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
            }
        }
    }
@@ -1555,7 +1506,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
        int downscaled_height = static_cast<int>(original_height * scale);
        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        int wasted_resolution = (width * height) - effective_resolution;
-        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution;
@@ -1594,7 +1545,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
    bool pad_to_square = true;
    if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        printf("This gguf file seems to have no vision encoder\n");
        return false;
    }
    auto & params = ctx->vision_model.hparams;
@@ -1671,7 +1622,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
            }

            for (size_t i = 0; i < patches.size(); i++) {
-                // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+                // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
                clip_image_u8_free(patches[i]);
            }

@@ -1814,7 +1765,7 @@ int clip_n_patches(const struct clip_ctx * ctx) {

 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
    if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        printf("This gguf file seems to have no vision encoder\n");
        return false;
    }

@@ -1826,7 +1777,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3

 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
    if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        printf("This gguf file seems to have no vision encoder\n");
        return false;
    }

@@ -1846,7 +1797,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const int image_size    = hparams.image_size;
    const int patch_size    = hparams.patch_size;
    const int num_patches   = ((image_size / patch_size) * (image_size / patch_size));
-    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_positions = num_patches + 1;

    {
        struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -1874,14 +1825,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    }

    {
-        if (ctx->has_class_embedding) {
-            struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
+        struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");

-            void* zero_mem = malloc(ggml_nbytes(embeddings));
-            memset(zero_mem, 0, ggml_nbytes(embeddings));
-            ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
-            free(zero_mem);
-        }
+        void* zero_mem = malloc(ggml_nbytes(embeddings));
+        memset(zero_mem, 0, ggml_nbytes(embeddings));
+        ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
+        free(zero_mem);
    }

    {
@@ -1990,7 +1939,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            new_type = type;
            if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
                new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
-                // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
+                // fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
            }
            const size_t n_elms = ggml_nelements(cur);
            float * f32_data;
@@ -2009,7 +1958,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                f32_data = (float *)conv_buf.data();
                break;
            default:
-                LOG_TEE("Please use an input file in f32 or f16\n");
+                printf("Please use an input file in f32 or f16\n");
                gguf_free(ctx_out);
                return false;
            }
@@ -2036,7 +1985,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            fout.put(0);
        }

-        LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
+        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
               orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
    }

@@ -2052,8 +2001,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
    gguf_free(ctx_out);

    {
-        LOG_TEE("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+        printf("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+        printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
    }

    return true;
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -1,5 +1,4 @@
 #include "ggml.h"
-#include "log.h"
 #include "common.h"
 #include "clip.h"
 #include "llava.h"
@@ -19,7 +18,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-            LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
        *n_past += n_eval;
@@ -46,7 +45,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
    static std::string ret;
-    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
+    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
        ret = "</s>";
    } else {
        ret = llama_token_to_piece(ctx_llama, id);
@@ -74,7 +73,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
    size_t img_base64_str_start, img_base64_str_end;
    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-        LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+        fprintf(stderr, "%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
        return NULL;
    }

@@ -88,7 +87,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip

    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
    if (!embed) {
-        LOG_TEE("%s: could not load image from base64 string.\n", __func__);
+        fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
        return NULL;
    }

@@ -113,29 +112,29 @@ struct llava_context {
 };

 static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG_TEE("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+    fprintf(stderr, "\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    fprintf(stderr, "  note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

-static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
+static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {

    // load and preprocess the image
    llava_image_embed * embed = NULL;
    auto prompt = params->prompt;
    if (prompt_contains_image(prompt)) {
        if (!params->image.empty()) {
-            LOG_TEE("using base64 encoded image instead of command line image path\n");
+            fprintf(stderr, "using base64 encoded image instead of command line image path\n");
        }
        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
        if (!embed) {
-            LOG_TEE("%s: can't load image from prompt\n", __func__);
+            fprintf(stderr, "%s: can't load image from prompt\n", __func__);
            return NULL;
        }
        params->prompt = remove_image_from_prompt(prompt);
    } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
        if (!embed) {
-            fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
+            fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
            return NULL;
        }
    }
@@ -154,18 +153,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
        system_prompt = prompt.substr(0, image_pos);
        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
+        printf("system_prompt: %s\n", system_prompt.c_str());
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
-        LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
+        printf("user_prompt: %s\n", user_prompt.c_str());
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    } else {
@@ -175,7 +174,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    }
@@ -186,14 +185,9 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_

    // generate the response

-    LOG_TEE("\n");
+    fprintf(stderr, "\n");

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
-    if (!ctx_sampling) {
-        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
-        exit(1);
-    }
-
    std::string response = "";
    for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
@@ -212,21 +206,8 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
    printf("\n");
 }

-static struct llama_model * llava_init(gpt_params * params) {
-    llama_backend_init();
-    llama_numa_init(params->numa);

-    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
-
-    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
-    if (model == NULL) {
-        LOG_TEE("%s: error: unable to load model\n" , __func__);
-        return NULL;
-    }
-    return model;
-}
-
-static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
+static struct llava_context * llava_init(gpt_params * params) {
    const char * clip_path = params->mmproj.c_str();

    auto prompt = params->prompt;
@@ -236,6 +217,16 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode

    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);

+    llama_backend_init();
+    llama_numa_init(params->numa);
+
+    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+
+    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return NULL;
+    }

    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
    ctx_params.n_ctx           = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@@ -243,7 +234,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

    if (ctx_llama == NULL) {
-        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return NULL;
    }

@@ -266,12 +257,6 @@ static void llava_free(struct llava_context * ctx_llava) {
    llama_backend_free();
 }

-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    LOG_TEE("%s", text);
-}
-
 int main(int argc, char ** argv) {
    ggml_time_init();

@@ -281,58 +266,29 @@ int main(int argc, char ** argv) {
        show_additional_info(argc, argv);
        return 1;
    }
-
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("llava", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-    llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
-
    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
        gpt_print_usage(argc, argv, params);
        show_additional_info(argc, argv);
        return 1;
    }
-    auto model = llava_init(&params);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
+
+    auto ctx_llava = llava_init(&params);
+    if (ctx_llava == NULL) {
+        fprintf(stderr, "%s: error: failed to init llava\n", __func__);
        return 1;
    }

-    if (prompt_contains_image(params.prompt)) {
-        auto ctx_llava = llava_init_context(&params, model);
-
-        auto image_embed = load_image(ctx_llava, &params, "");
-
-        // process the prompt
-        process_prompt(ctx_llava, image_embed, &params, params.prompt);
-
-        llama_print_timings(ctx_llava->ctx_llama);
-        llava_image_embed_free(image_embed);
-        ctx_llava->model = NULL;
-        llava_free(ctx_llava);
-    } else {
-        for (auto & image : params.image) {
-            auto ctx_llava = llava_init_context(&params, model);
-
-            auto image_embed = load_image(ctx_llava, &params, image);
-            if (!image_embed) {
-                std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
-                return 1;
-            }
-
-            // process the prompt
-            process_prompt(ctx_llava, image_embed, &params, params.prompt);
-
-            llama_print_timings(ctx_llava->ctx_llama);
-            llava_image_embed_free(image_embed);
-            ctx_llava->model = NULL;
-            llava_free(ctx_llava);
-        }
+    auto image_embed = load_image(ctx_llava, &params);
+    if (!image_embed) {
+        return 1;
    }

-    llama_free_model(model);
+    // process the prompt
+    process_prompt(ctx_llava, image_embed, &params, params.prompt);

+    llama_print_timings(ctx_llava->ctx_llama);
+
+    llava_image_embed_free(image_embed);
+    llava_free(ctx_llava);
    return 0;
 }
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -54,7 +54,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
        int downscaled_height = static_cast<int>(original_height * scale);
        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        int wasted_resolution = (width * height) - effective_resolution;
-        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution;
@@ -88,6 +88,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
 static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
    struct {
+        struct ggml_tensor * newline;
        struct ggml_context * ctx;
    } model;

@@ -149,6 +150,20 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>

    model.ctx = ggml_init(params);

+    ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
+    model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
+    if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
+        if (newline_tmp->buffer == NULL) {
+            printf("newline_tmp tensor buffer is NULL\n");
+        }
+        ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
+    } else {
+        model.newline->data = newline_tmp->data;
+        if (model.newline->data == NULL) {
+            printf("newline_tmp tensor data is NULL\n");
+        }
+    }
+
    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
    // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
    // fill it with the image embeddings, ignoring the base
@@ -209,7 +224,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    img_res_v.size = 0;
    img_res_v.data = nullptr;
    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
-        LOG_TEE("%s: unable to preprocess image\n", __func__);
+        fprintf(stderr, "%s: unable to preprocess image\n", __func__);
        delete[] img_res_v.data;
        return false;
    }
@@ -224,7 +239,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
        delete[] img_res_v.data;
        if (!encoded) {
-            LOG_TEE("Unable to encode image\n");
+            fprintf(stderr, "Unable to encode image\n");

            return false;
        }
@@ -237,12 +252,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
            const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
            if (!encoded) {
-                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                return false;
            }
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        const int32_t * image_grid = clip_image_grid(ctx_clip);

@@ -275,12 +290,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
    }

-    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+    printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

    const int64_t t_img_enc_end_us = ggml_time_us();
    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

-    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+    printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);

    return true;
 }
@@ -290,7 +305,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
    auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
    if (n_image_embd != n_llama_embd) {
-        LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
        return false;
    }
    return true;
@@ -299,13 +314,13 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
    if (!image_embd) {
-        LOG_TEE("Unable to allocate memory for image embeddings\n");
+        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
        return false;
    }

    int n_img_pos;
    if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
-        LOG_TEE("%s: cannot encode image, aborting\n", __func__);
+        fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
        free(image_embd);
        return false;
    }
@@ -325,7 +340,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
        }
        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
        if (llama_decode(ctx_llama, batch)) {
-            LOG_TEE("%s : failed to eval\n", __func__);
+            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
@@ -337,7 +352,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
    clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
        clip_image_u8_free(img);
-        LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
+        fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
        return NULL;
    }

@@ -346,7 +361,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
    bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
    if (!image_embed_result) {
        clip_image_u8_free(img);
-        LOG_TEE("%s: coulnd't embed the image\n", __func__);
+        fprintf(stderr, "%s: coulnd't embed the image\n", __func__);
        return NULL;
    }

@@ -360,7 +375,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
 static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
    auto file = fopen(path, "rb");
    if (file == NULL) {
-        LOG_TEE("%s: can't read file %s\n", __func__, path);
+        fprintf(stderr, "%s: can't read file %s\n", __func__, path);
        return false;
    }

@@ -370,7 +385,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long

    auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
    if (buffer == NULL) {
-        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+        fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
        perror("Memory allocation error");
        fclose(file);
        return false;
@@ -395,7 +410,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
    long image_bytes_length;
    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
    if (!loaded) {
-        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
+        fprintf(stderr, "%s: failed to load %s\n", __func__, image_path);
        return NULL;
    }

--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -299,7 +299,7 @@ int main(int argc, char ** argv) {
                }
                fflush(stdout);

-                if (llama_token_is_eog(model, id)) {
+                if (id == llama_token_eos(model)) {
                    has_eos = true;
                }

--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -30,6 +30,7 @@ int main(int argc, char ** argv){

    // load the model
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_set_rng_seed(ctx, params.seed);
    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

    // tokenize the prompt
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -38,6 +38,7 @@ int main(int argc, char ** argv){

    // load the model
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_set_rng_seed(ctx, params.seed);
    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

    // tokenize the prompt
@@ -140,7 +141,7 @@ int main(int argc, char ** argv){
                printf("%s", token_str.c_str());
            }

-            if (llama_token_is_eog(model, id)) {
+            if (id == llama_token_eos(model)) {
                has_eos = true;
            }

--- a/examples/main-cmake-pkg/README.md
+++ b/examples/main-cmake-pkg/README.md
@@ -17,9 +17,11 @@ In this case, CLBlast was already installed so the CMake package is referenced i
 ```cmd
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
-cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
-cmake --build build --config Release
-cmake --install build --prefix C:/LlamaCPP
+mkdir build
+cd build
+cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
+cmake --build . --config Release
+cmake --install . --prefix C:/LlamaCPP
 ```

 ### Build main-cmake-pkg
@@ -27,7 +29,9 @@ cmake --install build --prefix C:/LlamaCPP

 ```cmd
 cd ..\examples\main-cmake-pkg
-cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
-cmake --build build --config Release
-cmake --install build --prefix C:/MyLlamaApp
+mkdir build
+cd build
+cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
+cmake --build . --config Release
+cmake --install . --prefix C:/MyLlamaApp
 ```
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -66,7 +66,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt

 In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:

-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set).
+-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 -   `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
@@ -143,7 +143,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by

 ### Extended Context Size

-Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
+Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.

 -   `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.

@@ -286,7 +286,7 @@ These options help improve the performance and memory usage of the LLaMA models.

 -   `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
 -   `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node.
-   `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitrary core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus.
+-   `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitraty core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus.

 These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.

@@ -304,12 +304,10 @@ These options help improve the performance and memory usage of the LLaMA models.

 -   `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.

-### Grammars & JSON schemas
+### Grammars

 -   `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.

-   `--json-schema SCHEMA`: Specify a [JSON schema](https://json-schema.org/) to constrain model output to (e.g. `{}` for any JSON object, or `{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}` for a JSON array of strings with size constraints). If a schema uses external `$ref`s, you should use `--grammar "$( python examples/json_schema_to_grammar.py myschema.json )"` instead.
-
 ### Quantization

 For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -240,6 +240,7 @@ int main(int argc, char ** argv) {
                return 1;
            }
            session_tokens.resize(n_token_count_out);
+            llama_set_rng_seed(ctx, params.seed);
            LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
        }
    }
@@ -324,7 +325,7 @@ int main(int argc, char ** argv) {
            log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());

    // if we will use the cache for the full prompt without reaching the end of the cache, force
-    // reevaluation of the last token to recalculate the cached logits
+    // reevaluation of the last token token to recalculate the cached logits
    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
        LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);

@@ -362,9 +363,6 @@ int main(int argc, char ** argv) {
        params.interactive_first = true;
        params.antiprompt.emplace_back("<|im_start|>user\n");
    }
-    else if (params.conversation) {
-        params.interactive_first = true;
-    }

    // enable interactive mode if interactive start is specified
    if (params.interactive_first) {
@@ -523,10 +521,6 @@ int main(int argc, char ** argv) {
    }

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
-    if (!ctx_sampling) {
-        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
-        exit(1);
-    }

    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
@@ -551,7 +545,7 @@ int main(int argc, char ** argv) {
                // if we run out of context:
                // - take the n_keep first tokens from the original prompt (via n_past)
                // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-                if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
+                if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
                    if (params.n_predict == -2) {
                        LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                        break;
@@ -740,7 +734,7 @@ int main(int argc, char ** argv) {
        // display text
        if (input_echo && display) {
            for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
+                const std::string token_str = llama_token_to_piece(ctx, id);
                printf("%s", token_str.c_str());

                if (embd.size() > 1) {
@@ -801,9 +795,9 @@ int main(int argc, char ** argv) {
                }
            }

-            // deal with end of generation tokens in interactive mode
-            if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
-                LOG("found an EOG token\n");
+            // deal with end of text token in interactive mode
+            if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+                LOG("found EOS token\n");

                if (params.interactive) {
                    if (!params.antiprompt.empty()) {
@@ -823,7 +817,7 @@ int main(int argc, char ** argv) {
            if (n_past > 0 && is_interacting) {
                LOG("waiting for user input\n");

-                if (params.conversation || params.instruct || params.chatml) {
+                if (params.instruct || params.chatml) {
                    printf("\n> ");
                }

@@ -833,7 +827,7 @@ int main(int argc, char ** argv) {
                }

                std::string buffer;
-                if (!params.input_prefix.empty() && !params.conversation) {
+                if (!params.input_prefix.empty()) {
                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                    printf("%s", params.input_prefix.c_str());
                }
@@ -857,7 +851,7 @@ int main(int argc, char ** argv) {
                // Entering a empty line lets the user pass control back
                if (buffer.length() > 1) {
                    // append input suffix if any
-                    if (!params.input_suffix.empty() && !params.conversation) {
+                    if (!params.input_suffix.empty()) {
                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                        printf("%s", params.input_suffix.c_str());
                    }
@@ -883,7 +877,7 @@ int main(int argc, char ** argv) {
                    }

                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                    const auto line_inp = ::llama_tokenize(ctx, buffer,              false, params.interactive_specials);
+                    const auto line_inp = ::llama_tokenize(ctx, buffer,              false, false);
                    const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
@@ -926,8 +920,8 @@ int main(int argc, char ** argv) {
            }
        }

-        // end of generation
-        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) {
+        // end of text token
+        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
            LOG_TEE(" [end of text]\n");
            break;
        }
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -359,7 +359,7 @@ int main(int argc, char ** argv) {
                //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());

                if (client.n_decoded > 2 &&
-                        (llama_token_is_eog(model, id) ||
+                        (id == llama_token_eos(model) ||
                         (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
                         client.response.find("User:") != std::string::npos ||
                         client.response.find('\n') != std::string::npos)) {
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -252,8 +252,8 @@ int main(int argc, char ** argv) {
            // sample the most likely token
            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

-            // is it an end of generation?
-            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+            // is it an end of stream?
+            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
                LOG_TEE("\n");

                break;
--- a/examples/perplexity/README.md
+++ b/examples/perplexity/README.md
@@ -1,176 +1,8 @@
-# Perplexity
+# perplexity

-The `perplexity` example can be used to calculate the so-called perplexity value of a language model over a given text corpus.
-Perplexity measures how well the model can predict the next token with lower values being better.
-Note that perplexity is **not** directly comparable between models, especially if they use different tokenizers.
-Also note that finetunes typically result in a higher perplexity value even though the human-rated quality of outputs increases.
-
-Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16.
-The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`).
-When numbers are listed all command line arguments and compilation options are left at their defaults unless noted otherwise.
-llama.cpp numbers are **not** directly comparable to those of other projects because the exact values depend strongly on the implementation details.
-
-By default only the mean perplexity value and the corresponding uncertainty is calculated.
-The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits per and then applying error propagation.
-
-More statistics can be obtained by recording the logits from the FP16 version of a model.
-To do this, supply `perplexity` with `--kl-divergence-base path/to/logit/binary/file.kld`.
-The program will then record all logits and save them to the provided path in binary format.
-**The logit file will be very large, 11 GiB for LLaMA 2 or 37 GiB for LLaMA 3 when using the Wikitext-2 test set.**
-Once you have the file, supply `perplexity` with the quantized model, the logits file via `--kl-divergence-base`,
-and finally the `--kl-divergence` argument to indicate that the program should calculate the so-called Kullback-Leibler divergence.
-This is a measure of how similar the FP16 and the quantized logit distributions are with a value of 0 indicating that the distribution are the same.
-The uncertainty on the mean KL divergence is calculated by assuming the KL divergence per token follows a Gaussian distribution.
-
-In addition to the KL divergence the following statistics are calculated with `--kl-divergence`:
-
-* Ratio of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated. The logarithm of this metric is also calculated and printed, it is 0 if the logit distributions are the same.
-* Difference of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated.
-* Mean change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse.
-* Pearson correlation coefficient of the "correct" token probabilites between models.
-* Percentiles of change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse. Can be used to judge noise vs. quality loss from quantization. If the percentiles are symmetric then the quantization is essentially just adding noise. If the negative values are significantly larger than the positive values then this indicates that the model is actually becoming worse from the quantization.
-* The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggerganov/llama.cpp/discussions/2875 .
-* Same top p: Percentage of how often the token was assigned the highest probabilites by both models. The uncertainty is calculated from the Gaussian approximation of the binomial distribution.
-
-## LLaMA 3 8b Scoreboard
-
-| Revision | f364eb6f           |
-|:---------|:-------------------|
-| Backend  | CUDA               |
-| CPU      | AMD Epyc 7742      |
-| GPU      | 1x NVIDIA RTX 4090 |
-
-Results were generated using the CUDA backend and are sorted by Kullback-Leibler divergence relative to FP16.
-The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat).
-
-| Quantization | imatrix | Model size [GiB] | PPL                    | ΔPPL                   | KLD                   | Mean Δp           | RMS Δp           |
-|--------------|---------|------------------|------------------------|------------------------|-----------------------|-------------------|------------------|
-| f16          | None    |            14.97 | 6.233160 ±   0.037828  | -                      | -                     | -                 | -                |
-| q8_0         | None    |             7.96 | 6.234284 ±   0.037878  | 0.002650 ±   0.001006  | 0.001355 ±   0.000006 | -0.019 ± 0.003 %  | 1.198 ± 0.007 %  |
-| q6_K         | None    |             6.14 | 6.253382 ±   0.038078  | 0.021748 ±   0.001852  | 0.005452 ±   0.000035 | -0.007 ± 0.006 %  | 2.295 ± 0.019 %  |
-| q5_K_M       | None    |             5.33 | 6.288607 ±   0.038338  | 0.056974 ±   0.002598  | 0.010762 ±   0.000079 | -0.114 ± 0.008 %  | 3.160 ± 0.031 %  |
-| q5_K_S       | None    |             5.21 | 6.336598 ±   0.038755  | 0.104964 ±   0.003331  | 0.016595 ±   0.000122 | -0.223 ± 0.010 %  | 3.918 ± 0.036 %  |
-| q5_1         | None    |             5.65 | 6.337857 ±   0.038677  | 0.106223 ±   0.003476  | 0.018045 ±   0.000139 | -0.287 ± 0.011 %  | 4.123 ± 0.039 %  |
-| q5_0         | None    |             5.21 | 6.363224 ±   0.038861  | 0.131591 ±   0.003894  | 0.022239 ±   0.000166 | -0.416 ± 0.012 %  | 4.634 ± 0.043 %  |
-| q4_K_M       | WT 10m  |             4.58 | 6.382937 ±   0.039055  | 0.151303 ±   0.004429  | 0.028152 ±   0.000240 | -0.389 ± 0.014 %  | 5.251 ± 0.049 %  |
-| q4_K_M       | None    |             4.58 | 6.407115 ±   0.039119  | 0.175482 ±   0.004620  | 0.031273 ±   0.000238 | -0.596 ± 0.014 %  | 5.519 ± 0.050 %  |
-| q4_K_S       | WT 10m  |             4.37 | 6.409697 ±   0.039189  | 0.178064 ±   0.004744  | 0.031951 ±   0.000259 | -0.531 ± 0.015 %  | 5.645 ± 0.051 %  |
-| iq4_NL       | WT 10m  |             4.35 | 6.455593 ±   0.039630  | 0.223959 ±   0.005201  | 0.035742 ±   0.000288 | -0.590 ± 0.016 %  | 5.998 ± 0.054 %  |
-| iq4_XS       | WT 10m  |             4.14 | 6.459705 ±   0.039595  | 0.228071 ±   0.005207  | 0.036334 ±   0.000284 | -0.668 ± 0.016 %  | 6.044 ± 0.054 %  |
-| q4_K_S       | None    |             4.37 | 6.500529 ±   0.039778  | 0.268895 ±   0.005638  | 0.043136 ±   0.000314 | -0.927 ± 0.017 %  | 6.562 ± 0.055 %  |
-| q4_1         | None    |             4.78 | 6.682737 ±   0.041285  | 0.451103 ±   0.008030  | 0.071683 ±   0.000505 | -0.927 ± 0.017 %  | 8.512 ± 0.063 %  |
-| q4_0         | None    |             4.34 | 6.700147 ±   0.041226  | 0.468514 ±   0.007951  | 0.071940 ±   0.000491 | -1.588 ± 0.022 %  | 8.434 ± 0.061 %  |
-| q3_K_L       | WT 10m  |             4.03 | 6.671223 ±   0.041427  | 0.439590 ±   0.008154  | 0.073077 ±   0.000529 | -0.940 ± 0.023 %  | 8.662 ± 0.064 %  |
-| q3_K_M       | WT 10m  |             3.74 | 6.734255 ±   0.041838  | 0.502622 ±   0.008901  | 0.084358 ±   0.000588 | -1.198 ± 0.024 %  | 9.292 ± 0.065 %  |
-| q3_K_L       | None    |             4.03 | 6.787876 ±   0.042104  | 0.556242 ±   0.009171  | 0.087176 ±   0.000614 | -1.532 ± 0.025 %  | 9.432 ± 0.067 %  |
-| q3_K_M       | None    |             3.74 | 6.888498 ±   0.042669  | 0.656864 ±   0.010071  | 0.101913 ±   0.000677 | -1.990 ± 0.026 %  | 10.203 ± 0.068 % |
-| iq3_M        | WT 10m  |             3.53 | 6.898327 ±   0.041643  | 0.666694 ±   0.009449  | 0.102534 ±   0.000663 | -3.178 ± 0.026 %  | 10.513 ± 0.066 % |
-| iq3_S        | WT 10m  |             3.42 | 6.965501 ±   0.042406  | 0.733867 ±   0.010245  | 0.111278 ±   0.000710 | -3.066 ± 0.027 %  | 10.845 ± 0.068 % |
-| iq3_XS       | WT 10m  |             3.28 | 7.163043 ±   0.043772  | 0.931409 ±   0.012084  | 0.138693 ±   0.000857 | -3.667 ± 0.031 %  | 12.148 ± 0.070 % |
-| iq3_XXS      | WT 10m  |             3.05 | 7.458436 ±   0.046404  | 1.226803 ±   0.015234  | 0.183625 ±   0.001042 | -3.918 ± 0.035 %  | 13.836 ± 0.074 % |
-| q3_K_S       | WT 10m  |             3.41 | 7.602878 ±   0.046848  | 1.371244 ±   0.015688  | 0.199821 ±   0.001008 | -5.046 ± 0.037 %  | 14.980 ± 0.070 % |
-| q3_K_S       | None    |             3.41 | 7.863786 ±   0.048885  | 1.632152 ±   0.017733  | 0.228217 ±   0.001079 | -5.604 ± 0.038 %  | 15.541 ± 0.070 % |
-| iq2_M        | WT 10m  |             2.74 | 8.600799 ±   0.055124  | 2.369166 ±   0.025244  | 0.325989 ±   0.00160  | -6.463 ± 0.046 %  | 18.519 ± 0.080 % |
-| q2_K         | WT 10k  |             2.96 | 8.652290 ±   0.055572  | 2.420657 ±   0.025587  | 0.331393 ±   0.001562 | -6.606 ± 0.046 %  | 18.790 ± 0.078 % |
-| q2_K         | WT 100k |             2.96 | 8.641993 ±   0.055406  | 2.410359 ±   0.025495  | 0.331672 ±   0.001569 | -6.628 ± 0.047 %  | 18.856 ± 0.078 % |
-| q2_K         | WT 10m  |             2.96 | 8.647825 ±   0.055610  | 2.416191 ±   0.025683  | 0.332223 ±   0.001572 | -6.500 ± 0.047 %  | 18.881 ± 0.078 % |
-| q2_K         | WT 1m   |             2.96 | 8.674365 ±   0.055743  | 2.442732 ±   0.025843  | 0.335308 ±   0.001576 | -6.634 ± 0.047 %  | 19.009 ± 0.079 % |
-| q2_K         | WT 1k   |             2.96 | 8.682605 ±   0.055916  | 2.450972 ±   0.026069  | 0.337093 ±   0.001596 | -6.596 ± 0.047 %  | 18.977 ± 0.079 % |
-| q2_K_S       | WT 10m  |             2.96 | 9.323778 ±   0.061551  | 3.092145 ±   0.031914  | 0.403360 ±   0.001787 | -7.131 ± 0.049 %  | 20.050 ± 0.081 % |
-| q2_K_S       | WT 1m   |             2.96 | 9.329321 ±   0.061378  | 3.097688 ±   0.031816  | 0.403590 ±   0.001797 | -7.289 ± 0.049 %  | 20.123 ± 0.081 % |
-| q2_K_S       | WT 100k |             2.96 | 9.362973 ±   0.061740  | 3.131339 ±   0.032169  | 0.408367 ±   0.001802 | -7.198 ± 0.050 %  | 20.132 ± 0.081 % |
-| q2_K_S       | WT 10k  |             2.96 | 9.376479 ±   0.062045  | 3.144846 ±   0.032464  | 0.408662 ±   0.001819 | -7.141 ± 0.050 %  | 20.120 ± 0.081 % |
-| q2_K_S       | WT 1k   |             2.96 | 9.415200 ±   0.062475  | 3.183567 ±   0.032993  | 0.415865 ±   0.001846 | -7.153 ± 0.050 %  | 20.311 ± 0.082 % |
-| iq2_S        | WT 10m  |             2.56 | 9.650781 ±   0.063209  | 3.419148 ±   0.034017  | 0.439197 ±   0.001976 | -8.319 ± 0.052 %  | 21.491 ± 0.083 % |
-| q2_K         | None    |             2.96 | 9.751568 ±   0.063312  | 3.519934 ±   0.033863  | 0.445132 ±   0.001835 | -9.123 ± 0.051 %  | 21.421 ± 0.079 % |
-| iq2_XS       | WT 10m  |             2.43 | 10.761424 ±   0.071056 | 4.529791 ±   0.042229  | 0.546290 ±   0.002133 | -10.576 ± 0.056 % | 23.872 ± 0.082 % |
-| iq2_XXS      | WT 10m  |             2.24 | 14.091782 ±   0.098396 | 7.860148 ±   0.070752  | 0.812022 ±   0.002741 | -14.363 ± 0.065 % | 28.576 ± 0.084 % |
-| iq1_M        | WT 10m  |             2.01 | 25.493722 ±   0.177903 | 19.262089 ±   0.152396 | 1.393084 ±   0.003529 | -24.672 ± 0.077 % | 38.287 ± 0.084 % |
-| iq1_S        | WT 1m   |             1.88 | 58.097760 ±   0.438604 | 51.866126 ±   0.416604 | 2.211278 ±   0.004688 | -32.471 ± 0.087 % | 46.418 ± 0.085 % |
-| iq1_S        | WT 1k   |             1.88 | 58.267851 ±   0.446208 | 52.036218 ±   0.424373 | 2.214858 ±   0.004778 | -31.880 ± 0.089 % | 46.330 ± 0.086 % |
-| iq1_S        | WT 100k |             1.88 | 58.581498 ±   0.453145 | 52.349864 ±   0.431360 | 2.220834 ±   0.004818 | -32.261 ± 0.089 % | 46.002 ± 0.086 % |
-| iq1_S        | WT 10m  |             1.88 | 60.694593 ±   0.471290 | 54.462959 ±   0.449644 | 2.254554 ±   0.004868 | -31.973 ± 0.088 % | 46.271 ± 0.086 % |
-| iq1_S        | WT 10k  |             1.88 | 63.221324 ±   0.493077 | 56.989691 ±   0.471423 | 2.293527 ±   0.004885 | -32.261 ± 0.089 % | 46.562 ± 0.086 % |
-
-There seems to be no consistent improvement from using more Wikitext tokens for the importance matrix.
-K-quants score better on mean Δp than the legacy quants than e.g. KL divergence would suggest.
-
-## LLaMA 2 vs. LLaMA 3 Quantization comparison
-
-| Revision | f364eb6f           |
-|:---------|:-------------------|
-| Backend  | CUDA               |
-| CPU      | AMD Epyc 7742      |
-| GPU      | 1x NVIDIA RTX 4090 |
-
-| Metric          |          L2 7b q2_K |          L3 8b q2_K |        L2 7b q4_K_M |        L3 8b q4_K_M |          L2 7b q6_K |          L3 8b q6_K |          L2 7b q8_0 |          L3 8b q8_0 |
-|-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|
-| Mean PPL        | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 |
-| Mean PPL ratio  | 1.107955 ± 0.001427 | 1.564849 ± 0.004525 | 1.014242 ± 0.000432 | 1.028160 ± 0.000723 | 1.002406 ± 0.000191 | 1.003490 ± 0.000296 | 1.000689 ± 0.000107 | 1.000425 ± 0.000161 |
-| Mean ΔPPL       | 0.625552 ± 0.008725 | 3.519934 ± 0.033863 | 0.082526 ± 0.002530 | 0.175482 ± 0.004620 | 0.013941 ± 0.001110 | 0.021748 ± 0.001852 | 0.003990 ± 0.000624 | 0.002650 ± 0.001006 |
-| PPL correlation |              97.36% |              89.62% |              99.71% |              99.34% |              99.94% |              99.88% |              99.98% |              99.96% |
-| Mean KLD        | 0.108903 ± 0.000645 | 0.445132 ± 0.001835 | 0.012686 ± 0.000079 | 0.031273 ± 0.000238 | 0.002098 ± 0.000014 | 0.005452 ± 0.000035 | 0.000369 ± 0.000007 | 0.001355 ± 0.000006 |
-| Mean Δp         |    -2.710 ± 0.023 % |    -9.123 ± 0.051 % |    -0.416 ± 0.008 % |    -0.596 ± 0.014 % |    -0.035 ± 0.003 % |    -0.007 ± 0.006 % |    -0.005 ± 0.002 % |    -0.019 ± 0.003 % |
-| Maximum Δp      |             85.136% |             94.268% |             45.209% |             95.054% |             23.593% |             53.601% |             43.925% |             28.734% |
-| 99.9% Δp        |             37.184% |             50.003% |             17.461% |             27.084% |              7.798% |             13.613% |              3.387% |              6.402% |
-| 99.0% Δp        |             18.131% |             25.875% |              7.798% |             12.084% |              3.838% |              6.407% |              1.867% |              3.544% |
-| Median Δp       |             -0.391% |             -2.476% |             -0.026% |             -0.024% |             -0.001% |              0.000% |             -0.000% |             -0.000% |
-| 1.0% Δp         |            -39.762% |            -87.173% |            -11.433% |            -19.567% |             -4.222% |             -6.767% |             -1.862% |             -3.698% |
-| 0.1% Δp         |            -79.002% |            -98.897% |            -26.433% |            -56.054% |             -9.091% |            -16.584% |             -3.252% |             -6.579% |
-| Minimum Δp      |            -99.915% |            -99.965% |            -83.383% |            -98.699% |            -43.142% |            -68.487% |             -9.343% |            -24.301% |
-| RMS Δp          |     9.762 ± 0.053 % |    21.421 ± 0.079 % |     3.252 ± 0.024 % |     5.519 ± 0.050 % |     1.339 ± 0.010 % |     2.295 ± 0.019 % |     0.618 ± 0.011 % |     1.198 ± 0.007 % |
-| Same top p      |    85.584 ± 0.086 % |    71.138 ± 0.119 % |    94.665 ± 0.055 % |    91.901 ± 0.072 % |    97.520 ± 0.038 % |    96.031 ± 0.051 % |    98.846 ± 0.026 % |    97.674 ± 0.040 % |
-
-## LLaMA 3 BF16 vs. FP16 comparison
-
-| Revision | 83330d8c      |
-|:---------|:--------------|
-| Backend  | CPU           |
-| CPU      | AMD Epyc 7742 |
-| GPU      | N/A           |
-
-Results were calculated with LLaMA 3 8b BF16 as `--kl-divergence-base` and LLaMA 3 8b FP16 as the `--model` for comparison.
-
-| Metric                         |                    Value |
-|--------------------------------|--------------------------|
-| Mean PPL(Q)                    |      6.227711 ± 0.037833 |
-| Mean PPL(base)                 |      6.225194 ± 0.037771 |
-| Cor(ln(PPL(Q)), ln(PPL(base))) |                  99.990% |
-| Mean ln(PPL(Q)/PPL(base))      |      0.000404 ± 0.000086 |
-| Mean PPL(Q)/PPL(base)          |      1.000404 ± 0.000086 |
-| Mean PPL(Q)-PPL(base)          |      0.002517 ± 0.000536 |
-| Mean    KLD                    |  0.00002515 ± 0.00000020 |
-| Maximum KLD                    |                 0.012206 |
-| 99.9%   KLD                    |                 0.000799 |
-| 99.0%   KLD                    |                 0.000222 |
-| 99.0%   KLD                    |                 0.000222 |
-| Median  KLD                    |                 0.000013 |
-| 10.0%   KLD                    |                -0.000002 |
-| 5.0%   KLD                     |                -0.000008 |
-| 1.0%   KLD                     |                -0.000023 |
-| Minimum KLD                    |                -0.000059 |
-| Mean    Δp                     | -0.0000745 ± 0.0003952 % |
-| Maximum Δp                     |                   4.186% |
-| 99.9%   Δp                     |                   1.049% |
-| 99.0%   Δp                     |                   0.439% |
-| 95.0%   Δp                     |                   0.207% |
-| 90.0%   Δp                     |                   0.125% |
-| 75.0%   Δp                     |                   0.029% |
-| Median  Δp                     |                   0.000% |
-| 25.0%   Δp                     |                  -0.030% |
-| 10.0%   Δp                     |                  -0.126% |
-| 5.0%   Δp                      |                  -0.207% |
-| 1.0%   Δp                      |                  -0.434% |
-| 0.1%   Δp                      |                  -1.016% |
-| Minimum Δp                     |                  -4.672% |
-| RMS Δp                         |          0.150 ± 0.001 % |
-| Same top p                     |         99.739 ± 0.013 % |
-
-## Old Numbers
-
-<details>
-<summary>Llama 2 70B Scoreboard</summary>
+TODO

+## Llama 2 70B Scorechart
 | Quantization | Model size (GiB) | Perplexity | Delta to fp16 |
 |--------------|------------------|------------|---------------|
 | Q4_0         | 36.20            | 3.5550     | 3.61%         |
@@ -186,5 +18,3 @@ Results were calculated with LLaMA 3 8b BF16 as `--kl-divergence-base` and LLaMA
 | Q5_K_M       | 45.41            | 3.4451     | 0.40%         |
 | Q6_K         | 52.70            | 3.4367     | 0.16%         |
 | fp16         | 128.5            | 3.4313     | -             |
-
-</details>
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -216,22 +216,17 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
 }

 struct kl_divergence_result {
-    double sum_nll          = 0.0;
-    double sum_nll2         = 0.0;
-    double sum_nll_base     = 0.0;
-    double sum_nll_base2    = 0.0;
-    double sum_nll_nll_base = 0.0;
-    double sum_kld          = 0.0;
-    double sum_kld2         = 0.0;
-    double sum_p_diff       = 0.0;
-    double sum_p_diff2      = 0.0;
-    double sum_p_diff4      = 0.0;
-    float  max_p_diff       = 0.0f;
-    size_t n_same_top       = 0.0;
-    size_t count            = 0.0;
+    double sum_nll  = 0;
+    double sum_nll2 = 0;
+    double sum_kld  = 0;
+    double sum_kld2 = 0;
+    double sum_nll_diff  = 0;
+    double sum_nll_diff2 = 0;
+    size_t n_same_top = 0;
+    size_t count = 0;
 };

-static std::pair<double, float> log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
+static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
    float max_logit = logits[0];
    int imax = 0;
    for (int i = 1; i < n_vocab; ++i) {
@@ -249,17 +244,12 @@ static std::pair<double, float> log_softmax(int n_vocab, const float * logits, c
    const float scale = d[0];
    const float min_log_prob = d[1];
    base_log_prob += 4;
-
-    const float nll = max_logit + log_sum_exp - logits[tok];
+    float nll = max_logit + log_sum_exp - logits[tok];
    kld.sum_nll  += nll;
    kld.sum_nll2 += nll*nll;
-
-    const float nll_base = -(scale*base_log_prob[tok] + min_log_prob);
-    kld.sum_nll_base  += nll_base;
-    kld.sum_nll_base2 += nll_base*nll_base;
-
-    kld.sum_nll_nll_base += nll*nll_base;
-
+    nll += (scale*base_log_prob[tok] + min_log_prob);
+    kld.sum_nll_diff  += nll;
+    kld.sum_nll_diff2 += nll*nll;
    max_logit += log_sum_exp;
    double sum = 0;
    int imax_base = -1;
@@ -279,50 +269,34 @@ static std::pair<double, float> log_softmax(int n_vocab, const float * logits, c
    kld.sum_kld2 += sum*sum;
    ++kld.count;
    if (imax == imax_base) ++kld.n_same_top;
-
-    const float p_base = expf(-nll_base);
-    const float p = expf(-nll);
-    const float p_diff = p - p_base;
-    kld.sum_p_diff  += p_diff;
-    const double p_diff2 = p_diff*p_diff;
-    kld.sum_p_diff2 += p_diff2;
-    kld.sum_p_diff4 += p_diff2*p_diff2;
-    kld.max_p_diff = std::max(kld.max_p_diff, std::fabs(p_diff));
-
-    return std::make_pair(sum, p_diff);
+    return sum;
 }

 static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
        std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
-        float * kld_values, float * p_diff_values) {
+        float * kld_values) {
    std::mutex mutex;
    const int nv = 2*((n_vocab + 1)/2) + 4;
    int counter = 0;
-    auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values, p_diff_values] () {
+    auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () {
        kl_divergence_result local_kld;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int i = counter++;
            if (i >= n_token) {
-                kld.sum_nll          += local_kld.sum_nll;
-                kld.sum_nll2         += local_kld.sum_nll2;
-                kld.sum_nll_base     += local_kld.sum_nll_base;
-                kld.sum_nll_base2    += local_kld.sum_nll_base2;
-                kld.sum_nll_nll_base += local_kld.sum_nll_nll_base;
-                kld.sum_kld          += local_kld.sum_kld;
-                kld.sum_kld2         += local_kld.sum_kld2;
-                kld.sum_p_diff       += local_kld.sum_p_diff;
-                kld.sum_p_diff2      += local_kld.sum_p_diff2;
-                kld.sum_p_diff4      += local_kld.sum_p_diff4;
-                kld.n_same_top       += local_kld.n_same_top;
-                kld.max_p_diff        = std::max(kld.max_p_diff, local_kld.max_p_diff);
-                kld.count            += local_kld.count;
+                kld.sum_nll  += local_kld.sum_nll;
+                kld.sum_nll2 += local_kld.sum_nll2;
+                kld.sum_kld  += local_kld.sum_kld;
+                kld.sum_kld2 += local_kld.sum_kld2;
+                kld.sum_nll_diff  += local_kld.sum_nll_diff;
+                kld.sum_nll_diff2 += local_kld.sum_nll_diff2;
+                kld.n_same_top += local_kld.n_same_top;
+                kld.count += local_kld.count;
                break;
            }
            lock.unlock();
-            std::pair<double, float> v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
-            kld_values[i]    = (float)v.first;
-            p_diff_values[i] = v.second;
+            double v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
+            kld_values[i] = (float)v;
        }
    };
    for (auto & w : workers) {
@@ -1737,8 +1711,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);

    std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
-    std::vector<float>    kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
-    std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
+    std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
    std::vector<float> logits;
    if (num_batches > 1) {
        logits.reserve(n_ctx * n_vocab);
@@ -1755,18 +1728,9 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
        return std::make_pair(f, df);
    };
-    auto covariance = [] (double suma, double sumb, double sumab, size_t count) {
-        if (count < 10) {
-            return 0.0;
-        }
-        double var = sumab/count - (suma/count)*(sumb/count);
-        var /= count - 1;
-        return var;
-    };

    kl_divergence_result kld;
-    auto    kld_ptr =    kld_values.data();
-    auto p_diff_ptr = p_diff_values.data();
+    auto kld_ptr = kld_values.data();

    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * n_ctx;
@@ -1821,42 +1785,24 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
            }
            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);

-            printf("\nchunk             PPL               ln(PPL(Q)/PPL(base))          KL Divergence              Δp RMS            Same top p\n");
+            printf("\nchunk        PPL          ln(PPL(Q)/PPL(base))          KL-Divergence           Same top\n");
        }

        const int first = n_ctx/2;
        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
-        p_diff_ptr += n_ctx - 1 - first;
-        kld_ptr    += n_ctx - 1 - first;
+                workers, log_probs_uint16, kld, kld_ptr);
+        kld_ptr += n_ctx - 1 - first;

-        printf("%4d", i+1);
+        auto ppl           = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
+        auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
+        auto kl_div        = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
+        auto p_top = 1.*kld.n_same_top/kld.count;
+        auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1));

-        auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
-        const double ppl_val = exp(log_ppl.first);
-        const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
-        printf("    %9.4lf ± %9.4lf", ppl_val, ppl_unc);
-
-        auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
-        const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
-        const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
-        const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
-        printf("    %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
-
-        auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-        printf("    %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
-
-        auto p_diff_mse   = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
-        const double p_diff_rms_val = sqrt(p_diff_mse.first);
-        const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
-        printf("    %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
-
-        double p_top_val = 1.*kld.n_same_top/kld.count;
-        double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
-        printf("    %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
-
-        printf("\n");
+        printf("%4d    %10.4lf    %10.5lf ± %10.5f    %10.5f ± %10.5lf    %.5f ± %.5f\n", i+1, exp(ppl.first),
+                log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second,
+                p_top, d_p_top);

        fflush(stdout);

@@ -1867,97 +1813,31 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    if (kld.count < 100) return; // we do not wish to do statistics on so few values

    std::sort(kld_values.begin(), kld_values.end());
-    std::sort(p_diff_values.begin(), p_diff_values.end());

-    printf("====== Perplexity statistics ======\n");
-
-    auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
-    const double ppl_val = exp(log_ppl.first);
-    const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
-    printf("Mean PPL(Q)                   : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
-
-    auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
-    const double ppl_base_val = exp(log_ppl_base.first);
-    const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
-    printf("Mean PPL(base)                : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
-
-    const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
-    // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
-    const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
-    printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
-
-    const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
-    const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
-    printf("Mean ln(PPL(Q)/PPL(base))     : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
-
-    const double ppl_ratio_val = exp(log_ppl_ratio_val);
-    const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
-    printf("Mean PPL(Q)/PPL(base)         : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
-
-    const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
-    const double ppl_diff_val = ppl_val - ppl_base_val;
-    const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
-    printf("Mean PPL(Q)-PPL(base)         : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
-
-    printf("\n");
-
-    printf("====== KL divergence statistics ======\n");
+    printf("===== KL-divergence statistics\n");
    auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-    printf("Mean    KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
+    printf("Average: %10.6f ±%10.6lf\n", kl_div.first, kl_div.second);
    auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
                                               : kld_values[kld_values.size()/2];
+    printf("Median : %10.6f\n", kld_median);

-    auto percentile = [] (std::vector<float> values, float fraction) {
-        if (fraction <= 0) return values.front();
-        if (fraction >= 1) return values.back();
-        float p = fraction*(values.size() - 1);
+    auto percentile = [&kld_values] (float fraction) {
+        if (fraction <= 0) return kld_values.front();
+        if (fraction >= 1) return kld_values.back();
+        float p = fraction*(kld_values.size() - 1);
        size_t ip = size_t(p); p -= ip;
-        return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
+        return (1 - p)*kld_values[ip] + p*kld_values[std::min(ip+1, kld_values.size()-1)];
    };

-    printf("Maximum KLD: %10.6f\n", kld_values.back());
-    printf("99.9%%   KLD: %10.6f\n", percentile(kld_values, 0.999f));
-    printf("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
-    printf("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
-    printf("Median  KLD: %10.6f\n", kld_median);
-    printf("10.0%%   KLD: %10.6f\n", percentile(kld_values, 0.100f));
-    printf(" 5.0%%   KLD: %10.6f\n", percentile(kld_values, 0.050f));
-    printf(" 1.0%%   KLD: %10.6f\n", percentile(kld_values, 0.010f));
-    printf("Minimum KLD: %10.6f\n", kld_values.front());
+    printf("Maximum: %10.6f\n", kld_values.back());
+    printf("KLD_99 : %10.6f\n", percentile(0.99f));
+    printf("KLD_95 : %10.6f\n", percentile(0.95f));
+    printf("KLD_90 : %10.6f\n", percentile(0.90f));

-    printf("\n");
-
-    printf("====== Token probability statistics ======\n");
-
-    auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
-    printf("Mean    Δp: %6.3lf ± %5.3lf %%\n",  100.0*p_diff.first, 100.0*p_diff.second);
-
-    auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
-                                               : p_diff_values[p_diff_values.size()/2];
-
-    printf("Maximum Δp: %6.3lf%%\n",  100.0*p_diff_values.back());
-    printf("99.9%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
-    printf("99.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
-    printf("95.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
-    printf("90.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
-    printf("75.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
-    printf("Median  Δp: %6.3lf%%\n",  100.0*p_diff_median);
-    printf("25.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
-    printf("10.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
-    printf(" 5.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
-    printf(" 1.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
-    printf(" 0.1%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
-    printf("Minimum Δp: %6.3lf%%\n",  100.0*p_diff_values.front());
-
-    auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
-    // printf("MSE Δp    : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
-
-    const double p_diff_rms_val = sqrt(p_diff_mse.first);
-    const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
-    printf("RMS Δp    : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
-
-    const double same_top_p = 1.0*kld.n_same_top/kld.count;
-    printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
+    printf("Minimum: %10.6f\n", kld_values.front());
+    printf("KLD_01 : %10.6f\n", percentile(0.01f));
+    printf("KLD_05 : %10.6f\n", percentile(0.05f));
+    printf("KLD_10 : %10.6f\n", percentile(0.10f));

 }

@@ -1972,20 +1852,12 @@ int main(int argc, char ** argv) {

    const int32_t n_ctx = params.n_ctx;

-    if (n_ctx <= 0) {
-        fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
-        return 1;
-    }
-
    const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence;
-
    if (ppl) {
-        const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
-        const int32_t n_kv = n_seq * n_ctx;
-
+        int n_seq = std::max(1, params.n_batch / n_ctx);
+        int32_t n_kv = n_seq * n_ctx;
        params.n_parallel = n_seq;
-        params.n_ctx      = n_kv;
-
+        params.n_ctx = n_kv;
        params.n_batch = std::min(params.n_batch, n_kv);
    } else {
        params.n_batch = std::min(params.n_batch, params.n_ctx);
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -23,7 +23,7 @@
 #endif

 struct quantize_stats_params {
-    std::string model = DEFAULT_MODEL_PATH;
+    std::string model = "models/7B/ggml-model-f16.gguf";
    bool verbose = false;
    bool per_layer_stats = false;
    bool print_histogram = false;
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -1,8 +1,6 @@
 # quantize

-You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup.
-
-Note: It is synced from llama.cpp `main` every 6 hours.
+TODO

 ## Llama 2 7B

--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -8,6 +8,7 @@
 #include <unordered_map>
 #include <fstream>
 #include <cmath>
+#include <algorithm>

 struct quant_option {
    std::string name;
@@ -46,17 +47,12 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
-    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "14.00G, -0.0020 ppl @ Mistral-7B", },
-    { "BF16",   LLAMA_FTYPE_MOSTLY_BF16,   "14.00G, -0.0050 ppl @ Mistral-7B", },
+    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
    // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
    { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing", },
 };

-static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE       = "quantize.imatrix.file";
-static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix.dataset";
-static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
-static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS   = "quantize.imatrix.chunks_count";

 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
    std::string ftype_str;
@@ -101,7 +97,6 @@ static void usage(const char * executable) {
    printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
-    printf("  --keep-split: will generate quatized model in the same shards as input");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -117,7 +112,7 @@ static void usage(const char * executable) {
    exit(1);
 }

-static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
    if (!in) {
        printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
@@ -164,33 +159,18 @@ static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_
            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
        }
    }
-
-    // latest imatrix version contains the dataset filename at the end of the file
-    int m_last_call = 0;
-    if (in.peek() != EOF) {
-        in.read((char *)&m_last_call, sizeof(m_last_call));
-        int dataset_len;
-        in.read((char *)&dataset_len, sizeof(dataset_len));
-        std::vector<char> dataset_as_vec(dataset_len);
-        in.read(dataset_as_vec.data(), dataset_len);
-        imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
-        printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
-    }
-    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
-    return m_last_call;
+    printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
 }

-static int prepare_imatrix(const std::string & imatrix_file,
-        std::string & imatrix_dataset,
+static void prepare_imatrix(const std::string & imatrix_file,
        const std::vector<std::string> & included_weights,
        const std::vector<std::string> & excluded_weights,
        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
-    int m_last_call = -1;
    if (!imatrix_file.empty()) {
-        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
+        load_imatrix(imatrix_file, imatrix_data);
    }
    if (imatrix_data.empty()) {
-        return m_last_call;
+        return;
    }
    if (!excluded_weights.empty()) {
        for (auto& name : excluded_weights) {
@@ -216,7 +196,6 @@ static int prepare_imatrix(const std::string & imatrix_file,
    if (!imatrix_data.empty()) {
        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
    }
-    return m_last_call;
 }

 static ggml_type parse_ggml_type(const char * arg) {
@@ -231,6 +210,43 @@ static ggml_type parse_ggml_type(const char * arg) {
    return result;
 }

+static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+    const char* sep = strchr(data, '=');
+    if (sep == nullptr || sep - data >= 128) {
+        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+        return false;
+    }
+    llama_model_kv_override kvo;
+    std::strncpy(kvo.key, data, sep - data);
+    kvo.key[sep - data] = 0;
+    sep++;
+    if (strncmp(sep, "int:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+        kvo.int_value = std::atol(sep);
+    } else if (strncmp(sep, "float:", 6) == 0) {
+        sep += 6;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.float_value = std::atof(sep);
+    } else if (strncmp(sep, "bool:", 5) == 0) {
+        sep += 5;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+        if (std::strcmp(sep, "true") == 0) {
+            kvo.bool_value = true;
+        } else if (std::strcmp(sep, "false") == 0) {
+            kvo.bool_value = false;
+        } else {
+            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            return false;
+        }
+    } else {
+        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+        return false;
+    }
+    overrides.emplace_back(std::move(kvo));
+    return true;
+}
+
 int main(int argc, char ** argv) {
    if (argc < 3) {
        usage(argv[0]);
@@ -284,8 +300,6 @@ int main(int argc, char ** argv) {
            } else {
                usage(argv[0]);
            }
-        } else if (strcmp(argv[arg_idx], "--keep-split")) {
-            params.keep_split = true;
        } else {
            usage(argv[0]);
        }
@@ -299,43 +313,10 @@ int main(int argc, char ** argv) {
        usage(argv[0]);
    }

-    std::string imatrix_dataset;
    std::unordered_map<std::string, std::vector<float>> imatrix_data;
-    int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
+    prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
    if (!imatrix_data.empty()) {
        params.imatrix = &imatrix_data;
-        {
-            llama_model_kv_override kvo;
-            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
-            strncpy(kvo.val_str, imatrix_file.c_str(), 127);
-            kvo.val_str[127] = '\0';
-            kv_overrides.emplace_back(std::move(kvo));
-        }
-        if (!imatrix_dataset.empty()) {
-            llama_model_kv_override kvo;
-            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
-            strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
-            kvo.val_str[127] = '\0';
-            kv_overrides.emplace_back(std::move(kvo));
-        }
-
-        {
-            llama_model_kv_override kvo;
-            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.val_i64 = imatrix_data.size();
-            kv_overrides.emplace_back(std::move(kvo));
-        }
-
-        if (m_last_call > 0) {
-            llama_model_kv_override kvo;
-            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.val_i64 = m_last_call;
-            kv_overrides.emplace_back(std::move(kvo));
-        }
    }
    if (!kv_overrides.empty()) {
        kv_overrides.emplace_back();
@@ -351,28 +332,20 @@ int main(int argc, char ** argv) {
    std::string fname_out;

    std::string ftype_str;
-    std::string suffix = ".gguf";
    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
        std::string fpath;
        const size_t pos = fname_inp.find_last_of("/\\");
        if (pos != std::string::npos) {
            fpath = fname_inp.substr(0, pos + 1);
        }
-
-        // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
-        fname_out = fpath + "ggml-model-" + ftype_str;
-        if (!params.keep_split) {
-            fname_out += suffix;
-        }
+        // export as [inp path]/ggml-model-[ftype].gguf
+        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
        arg_idx++;
        if (ftype_str == "COPY") {
            params.only_copy = true;
        }
    } else {
        fname_out = argv[arg_idx];
-        if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
-            fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
-        }
        arg_idx++;

        if (argc <= arg_idx) {
--- a/examples/quantize/tests.sh
+++ b/examples/quantize/tests.sh
@@ -1,65 +0,0 @@
-#!/bin/bash
-
-set -eu
-
-if [ $# -lt 1 ]
-then
-    echo "usage:   $0 path_to_build_binary [path_to_temp_folder]"
-    echo "example: $0 ../../build/bin ../../tmp"
-    exit 1
-fi
-
-if [ $# -gt 1 ]
-then
-    TMP_DIR=$2
-else
-    TMP_DIR=/tmp
-fi
-
-set -x
-
-SPLIT=$1/gguf-split
-QUANTIZE=$1/quantize
-MAIN=$1/main
-WORK_PATH=$TMP_DIR/quantize
-ROOT_DIR=$(realpath $(dirname $0)/../../)
-
-mkdir -p "$WORK_PATH"
-
-# Clean up in case of previously failed test
-rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
-
-# 1. Get a model
-(
-cd $WORK_PATH
-"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
-)
-echo PASS
-
-# 2. Split model
-$SPLIT --split-max-tensors 28  $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
-echo PASS
-echo
-
-# 3. Requant model with '--keep_split'
-$QUANTIZE --allow-requantize --keep_split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
-echo PASS
-echo
-
-# 3a. Test the requanted model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
-echo PASS
-echo
-
-# 4. Requant mode without '--keep_split'
-$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
-echo PASS
-echo
-
-# 4b. Test the requanted model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
-echo PASS
-echo
-
-# Clean up
-rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
--- a/examples/regex-to-grammar.py
+++ b/examples/regex-to-grammar.py
@@ -8,7 +8,7 @@ print(subprocess.check_output(
        "python",
        os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
-        "json_schema_to_grammar.py"),
+        "json-schema-to-grammar.py"),
        *rest,
        "-",
        "--raw-pattern",
--- a/examples/rpc/CMakeLists.txt
+++ b/examples/rpc/CMakeLists.txt
@@ -1,2 +0,0 @@
-add_executable(rpc-server rpc-server.cpp)
-target_link_libraries(rpc-server PRIVATE ggml llama)
--- a/examples/rpc/README.md
+++ b/examples/rpc/README.md
@@ -1,74 +0,0 @@
-## Overview
-
-The `rpc-server` allows  running `ggml` backend on a remote host.
-The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
-This can be used for distributed LLM inference with `llama.cpp` in the following way:
-
-```mermaid
-flowchart TD
-    rpcb---|TCP|srva
-    rpcb---|TCP|srvb
-    rpcb-.-|TCP|srvn
-    subgraph hostn[Host N]
-    srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
-    end
-    subgraph hostb[Host B]
-    srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
-    end
-    subgraph hosta[Host A]
-    srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
-    end
-    subgraph host[Main Host]
-    ggml[llama.cpp]---rpcb[RPC backend]
-    end
-    style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
-```
-
-Each host can run a different backend, e.g. one with CUDA and another with Metal.
-You can also run multiple `rpc-server` instances on the same host, each with a different backend.
-
-## Usage
-
-On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
-For example, to build the CUDA backend with RPC support:
-
-```bash
-mkdir build-rpc-cuda
-cd build-rpc-cuda
-cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
-cmake --build . --config Release
-```
-
-Then, start the `rpc-server` with the backend:
-
-```bash
-$ bin/rpc-server -p 50052
-create_backend: using CUDA backend
-ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no
-ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
-ggml_cuda_init: found 1 CUDA devices:
-  Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
-Starting RPC server on 0.0.0.0:50052
-```
-
-When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
-```bash
-$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
-```
-This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
-
-
-On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
-
-```bash
-mkdir build-rpc
-cd build-rpc
-cmake .. -DLLAMA_RPC=ON
-cmake --build . --config Release
-```
-
-Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
-
-```bash
-$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
-```
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -1,130 +0,0 @@
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#include "ggml-rpc.h"
-#ifdef _WIN32
-#  include <windows.h>
-#else
-#  include <unistd.h>
-#endif
-#include <string>
-#include <stdio.h>
-
-struct rpc_server_params {
-    std::string host        = "0.0.0.0";
-    int         port        = 50052;
-    size_t      backend_mem = 0;
-};
-
-static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
-    fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -H HOST, --host HOST  host to bind to (default: %s)\n", params.host.c_str());
-    fprintf(stderr, "  -p PORT, --port PORT  port to bind to (default: %d)\n", params.port);
-    fprintf(stderr, "  -m MEM, --mem MEM     backend memory size (in MB)\n");
-    fprintf(stderr, "\n");
-}
-
-static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
-    std::string arg;
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-        if (arg == "-H" || arg == "--host") {
-            if (++i >= argc) {
-                return false;
-            }
-            params.host = argv[i];
-        } else if (arg == "-p" || arg == "--port") {
-            if (++i >= argc) {
-                return false;
-            }
-            params.port = std::stoi(argv[i]);
-            if (params.port <= 0 || params.port > 65535) {
-                return false;
-            }
-        } else if (arg == "-m" || arg == "--mem") {
-            if (++i >= argc) {
-                return false;
-            }
-            params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
-        } else if (arg == "-h" || arg == "--help") {
-            print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-    return true;
-}
-
-static ggml_backend_t create_backend() {
-    ggml_backend_t backend = NULL;
-#ifdef GGML_USE_CUDA
-    fprintf(stderr, "%s: using CUDA backend\n", __func__);
-    backend = ggml_backend_cuda_init(0); // init device 0
-    if (!backend) {
-        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-    }
-#elif GGML_USE_METAL
-    fprintf(stderr, "%s: using Metal backend\n", __func__);
-    backend = ggml_backend_metal_init();
-    if (!backend) {
-        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
-    }
-#endif
-
-    // if there aren't GPU Backends fallback to CPU backend
-    if (!backend) {
-        fprintf(stderr, "%s: using CPU backend\n", __func__);
-        backend = ggml_backend_cpu_init();
-    }
-    return backend;
-}
-
-static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
-#ifdef GGML_USE_CUDA
-    ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
-#else
-    #ifdef _WIN32
-        MEMORYSTATUSEX status;
-        status.dwLength = sizeof(status);
-        GlobalMemoryStatusEx(&status);
-        *total_mem = status.ullTotalPhys;
-        *free_mem = status.ullAvailPhys;
-    #else
-        long pages = sysconf(_SC_PHYS_PAGES);
-        long page_size = sysconf(_SC_PAGE_SIZE);
-        *total_mem = pages * page_size;
-        *free_mem = *total_mem;
-    #endif
-#endif
-}
-
-int main(int argc, char * argv[]) {
-    rpc_server_params params;
-    if (!rpc_server_params_parse(argc, argv, params)) {
-        fprintf(stderr, "Invalid parameters\n");
-        return 1;
-    }
-    ggml_backend_t backend = create_backend();
-    if (!backend) {
-        fprintf(stderr, "Failed to create backend\n");
-        return 1;
-    }
-    std::string endpoint = params.host + ":" + std::to_string(params.port);
-    size_t free_mem, total_mem;
-    if (params.backend_mem > 0) {
-        free_mem = params.backend_mem;
-        total_mem = params.backend_mem;
-    } else {
-        get_backend_memory(&free_mem, &total_mem);
-    }
-    printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
-    start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
-    ggml_backend_free(backend);
-    return 0;
-}
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,34 +1,17 @@
 set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
-set(TARGET_SRCS
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+add_executable(${TARGET}
    server.cpp
    utils.hpp
    httplib.h
 )
-set(PUBLIC_ASSETS
-    index.html
-    index.js
-    completion.js
-    json-schema-to-grammar.mjs
-)
-foreach(asset ${PUBLIC_ASSETS})
-    set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
-    set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
-    list(APPEND TARGET_SRCS ${output})
-    add_custom_command(
-        DEPENDS "${input}"
-        OUTPUT "${output}"
-        COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
-    )
-endforeach()
-add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common json-schema-to-grammar ${CMAKE_THREAD_LIBS_INIT})
 if (LLAMA_SERVER_SSL)
    find_package(OpenSSL REQUIRED)
    target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -11,14 +11,12 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 * Continuous batching
 * Multimodal (wip)
 * Monitoring endpoints
- * Schema-constrained JSON response format

 The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).

 **Command line options:**

- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
- `-t N`, `--threads N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
+- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
 - `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
@@ -37,7 +35,9 @@ The project is under active development, and we are [looking for feedback and co
 - `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
 - `--numa distribute`: Spread execution evenly over all nodes
 - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
+- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system
+page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
+
 - `--numa`: Attempt optimizations that may help on some NUMA systems.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
@@ -47,7 +47,7 @@ The project is under active development, and we are [looking for feedback and co
 - `--path`: Path from which to serve static files. Default: disabled
 - `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
 - `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
+- `--embedding`: Enable embedding extraction. Default: disabled
 - `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`
 - `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching).  Default: disabled
 - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
@@ -61,18 +61,6 @@ The project is under active development, and we are [looking for feedback and co
 - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name.  Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
 - `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
 - `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn`
- `--rope-freq-base N` : RoPE frequency base (default: loaded from model)
- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25)
- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation)
- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0)
- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0)
- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls`
- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled)
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)

 **If compiled with `LLAMA_SERVER_SSL=ON`**
 - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
@@ -85,18 +73,15 @@ The project is under active development, and we are [looking for feedback and co
 - Using `make`:

  ```bash
-  make server
+  make
  ```

 - Using `CMake`:

  ```bash
-  cmake -B build
-  cmake --build build --config Release -t server
+  cmake --build . --config Release
  ```

-  Binary is at `./build/bin/server`
-
 ## Build with SSL

 `server` can also be built with SSL support using OpenSSL 3
@@ -113,8 +98,10 @@ The project is under active development, and we are [looking for feedback and co
 - Using `CMake`:

  ```bash
-  cmake -B build -DLLAMA_SERVER_SSL=ON
-  cmake --build build --config Release -t server
+  mkdir build
+  cd build
+  cmake .. -DLLAMA_SERVER_SSL=ON
+  make server
  ```

 ## Quick Start
@@ -263,15 +250,13 @@ node index.js

    `grammar`: Set grammar for grammar-based sampling.  Default: no grammar

-    `json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` of a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features.  Default: no JSON schema.
-
    `seed`: Set the random number generator (RNG) seed.  Default: `-1`, which is a random seed.

    `ignore_eos`: Ignore end of stream token and continue generating.  Default: `false`

    `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`

-    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
+    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token. Default: `0`

    `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`

@@ -330,7 +315,7 @@ Notice that each `probs` is an array of length `n_probs`.

    `content`: Set the text to tokenize.

-    `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`
+    Note that a special `BOS` token is never inserted.

 - **POST** `/detokenize`: Convert tokens to text.

@@ -380,8 +365,6 @@ Notice that each `probs` is an array of length `n_probs`.

    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.

-    The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}`), similar to other OpenAI-inspired API providers.
-
    *Examples:*

    You can use either Python `openai` library with appropriate checkpoints:
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -268,7 +268,6 @@ def start_server_background(args):
    server_args.extend(['--defrag-thold', "0.1"])
    server_args.append('--cont-batching')
    server_args.append('--metrics')
-    server_args.append('--flash-attn')
    server_args.extend(['--log-format', "text"])
    args = [str(arg) for arg in [server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(args)}")
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -90,8 +90,7 @@ export default function () {
        "model": model,
        "stream": true,
        "seed": 42,
-        "max_tokens": max_tokens,
-        "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
+        "max_tokens": max_tokens
    }

    const params = {method: 'POST', body: JSON.stringify(payload)};
--- a/examples/server/completion.js.hpp
+++ b/examples/server/completion.js.hpp
@@ -0,0 +1,496 @@
+unsigned char completion_js[] = {
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x44,
+  0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x3a, 0x20, 0x74, 0x72,
+  0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64,
+  0x69, 0x63, 0x74, 0x3a, 0x20, 0x35, 0x30, 0x30, 0x2c, 0x0a, 0x20, 0x20,
+  0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x3a,
+  0x20, 0x30, 0x2e, 0x32, 0x2c, 0x0a, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70,
+  0x3a, 0x20, 0x5b, 0x22, 0x3c, 0x2f, 0x73, 0x3e, 0x22, 0x5d, 0x0a, 0x7d,
+  0x3b, 0x0a, 0x0a, 0x6c, 0x65, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72,
+  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
+  0x67, 0x73, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x0a,
+  0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65,
+  0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
+  0x20, 0x61, 0x73, 0x20, 0x61, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
+  0x74, 0x6f, 0x72, 0x2e, 0x20, 0x52, 0x65, 0x63, 0x6f, 0x6d, 0x6d, 0x65,
+  0x6e, 0x64, 0x65, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x6d, 0x6f, 0x73,
+  0x74, 0x20, 0x75, 0x73, 0x65, 0x20, 0x63, 0x61, 0x73, 0x65, 0x73, 0x2e,
+  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70,
+  0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
+  0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c,
+  0x61, 0x6d, 0x61, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27,
+  0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e,
+  0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
+  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65,
+  0x73, 0x74, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x22,
+  0x54, 0x65, 0x6c, 0x6c, 0x20, 0x6d, 0x65, 0x20, 0x61, 0x20, 0x6a, 0x6f,
+  0x6b, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64,
+  0x69, 0x63, 0x74, 0x3a, 0x20, 0x38, 0x30, 0x30, 0x7d, 0x29, 0x0a, 0x2f,
+  0x2f, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61,
+  0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68,
+  0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65,
+  0x73, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77,
+  0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64,
+  0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
+  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x2f, 0x2f, 0x0a,
+  0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63,
+  0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x2a, 0x20, 0x6c,
+  0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c,
+  0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d,
+  0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b,
+  0x7d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63,
+  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20,
+  0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x72,
+  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e,
+  0x73, 0x74, 0x20, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x20, 0x3d,
+  0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x61, 0x70, 0x69, 0x5f,
+  0x75, 0x72, 0x6c, 0x20, 0x7c, 0x7c, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x0a,
+  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x63, 0x6f, 0x6e, 0x74, 0x72,
+  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20,
+  0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43,
+  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x28, 0x29, 0x3b,
+  0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
+  0x74, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
+  0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e,
+  0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x44, 0x65, 0x66, 0x61, 0x75,
+  0x6c, 0x74, 0x73, 0x2c, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61,
+  0x6d, 0x73, 0x2c, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x7d,
+  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72,
+  0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x77,
+  0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x60, 0x24,
+  0x7b, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x7d, 0x2f, 0x63, 0x6f,
+  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x60, 0x2c, 0x20, 0x7b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x74, 0x68, 0x6f, 0x64, 0x3a,
+  0x20, 0x27, 0x50, 0x4f, 0x53, 0x54, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x62, 0x6f, 0x64, 0x79, 0x3a, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e,
+  0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x63, 0x6f,
+  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61,
+  0x6d, 0x73, 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x61,
+  0x64, 0x65, 0x72, 0x73, 0x3a, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x27, 0x43, 0x6f, 0x6e, 0x6e, 0x65, 0x63, 0x74, 0x69, 0x6f,
+  0x6e, 0x27, 0x3a, 0x20, 0x27, 0x6b, 0x65, 0x65, 0x70, 0x2d, 0x61, 0x6c,
+  0x69, 0x76, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x27, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54, 0x79, 0x70,
+  0x65, 0x27, 0x3a, 0x20, 0x27, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x2f, 0x6a, 0x73, 0x6f, 0x6e, 0x27, 0x2c, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, 0x41, 0x63, 0x63, 0x65, 0x70,
+  0x74, 0x27, 0x3a, 0x20, 0x27, 0x74, 0x65, 0x78, 0x74, 0x2f, 0x65, 0x76,
+  0x65, 0x6e, 0x74, 0x2d, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x27, 0x2c,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x28, 0x70,
+  0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65,
+  0x79, 0x20, 0x3f, 0x20, 0x7b, 0x27, 0x41, 0x75, 0x74, 0x68, 0x6f, 0x72,
+  0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x27, 0x3a, 0x20, 0x60, 0x42,
+  0x65, 0x61, 0x72, 0x65, 0x72, 0x20, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61,
+  0x6d, 0x73, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x7d, 0x60,
+  0x7d, 0x20, 0x3a, 0x20, 0x7b, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61,
+  0x6c, 0x3a, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65,
+  0x72, 0x2e, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x0a, 0x20, 0x20,
+  0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, 0x65,
+  0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x2e, 0x62, 0x6f, 0x64, 0x79, 0x2e,
+  0x67, 0x65, 0x74, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b,
+  0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63,
+  0x6f, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x54,
+  0x65, 0x78, 0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x28, 0x29,
+  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e,
+  0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20,
+  0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65,
+  0x72, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x42,
+  0x75, 0x66, 0x66, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x61,
+  0x72, 0x74, 0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x72, 0x65, 0x61, 0x64,
+  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72,
+  0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
+  0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b,
+  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20,
+  0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73,
+  0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
+  0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61, 0x64, 0x28,
+  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
+  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f, 0x6e, 0x65,
+  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
+  0x20, 0x41, 0x64, 0x64, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x6c, 0x65, 0x66,
+  0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x74,
+  0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e,
+  0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64,
+  0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x6c,
+  0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x64, 0x65,
+  0x63, 0x6f, 0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65,
+  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75,
+  0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
+  0x2f, 0x20, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x20, 0x69, 0x66, 0x20, 0x74,
+  0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x72,
+  0x61, 0x63, 0x74, 0x65, 0x72, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x6c,
+  0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x6e,
+  0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65, 0x42, 0x72,
+  0x65, 0x61, 0x6b, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, 0x65,
+  0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x5c, 0x6e, 0x27,
+  0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
+  0x20, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74,
+  0x65, 0x78, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x6c, 0x69, 0x6e,
+  0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74,
+  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78,
+  0x74, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x5c, 0x6e, 0x27,
+  0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
+  0x20, 0x49, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, 0x78, 0x74,
+  0x20, 0x64, 0x6f, 0x65, 0x73, 0x6e, 0x27, 0x74, 0x20, 0x65, 0x6e, 0x64,
+  0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65,
+  0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2c, 0x20, 0x74, 0x68, 0x65, 0x6e,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x6c, 0x69,
+  0x6e, 0x65, 0x20, 0x69, 0x73, 0x20, 0x69, 0x6e, 0x63, 0x6f, 0x6d, 0x70,
+  0x6c, 0x65, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
+  0x2f, 0x20, 0x53, 0x74, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x74, 0x20, 0x69,
+  0x6e, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x74,
+  0x6f, 0x20, 0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64, 0x20, 0x74,
+  0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6e, 0x65, 0x78, 0x74, 0x20, 0x63,
+  0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21,
+  0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65,
+  0x42, 0x72, 0x65, 0x61, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65,
+  0x72, 0x20, 0x3d, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x2e, 0x70, 0x6f,
+  0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72,
+  0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x52, 0x65,
+  0x73, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72,
+  0x20, 0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, 0x76, 0x65, 0x20,
+  0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b,
+  0x20, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e, 0x64, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x50, 0x61, 0x72, 0x73, 0x65, 0x20,
+  0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76, 0x65, 0x6e,
+  0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64, 0x20, 0x74,
+  0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c,
+  0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
+  0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20, 0x2f, 0x5e,
+  0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e, 0x2a, 0x29,
+  0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c,
+  0x69, 0x6e, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73,
+  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20,
+  0x3d, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x2e, 0x65, 0x78, 0x65, 0x63,
+  0x28, 0x6c, 0x69, 0x6e, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x61, 0x74, 0x63,
+  0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b, 0x6d, 0x61,
+  0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20, 0x6d, 0x61,
+  0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6e, 0x63,
+  0x65, 0x20, 0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, 0x74, 0x68,
+  0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e,
+  0x63, 0x70, 0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x6a,
+  0x75, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x74,
+  0x68, 0x65, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x64,
+  0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74,
+  0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73,
+  0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a,
+  0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65,
+  0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x72,
+  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63,
+  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
+  0x79, 0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c, 0x64, 0x20,
+  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
+  0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20,
+  0x73, 0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66,
+  0x72, 0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20,
+  0x77, 0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61,
+  0x6b, 0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72,
+  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73,
+  0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
+  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61,
+  0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
+  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69,
+  0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20,
+  0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74,
+  0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e,
+  0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20,
+  0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65,
+  0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75,
+  0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75,
+  0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x4a,
+  0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65,
+  0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c,
+  0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x6d, 0x65, 0x73, 0x73,
+  0x61, 0x67, 0x65, 0x2e, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x73,
+  0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61,
+  0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x68, 0x72, 0x6f, 0x77,
+  0x20, 0x61, 0x6e, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x74, 0x6f,
+  0x20, 0x62, 0x65, 0x20, 0x63, 0x61, 0x75, 0x67, 0x68, 0x74, 0x20, 0x62,
+  0x79, 0x20, 0x75, 0x70, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x20, 0x63,
+  0x61, 0x6c, 0x6c, 0x65, 0x72, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74,
+  0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72,
+  0x6f, 0x72, 0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61,
+  0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x3b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65,
+  0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e,
+  0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x5b, 0x24,
+  0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f,
+  0x72, 0x2e, 0x63, 0x6f, 0x64, 0x65, 0x7d, 0x20, 0x2d, 0x20, 0x24, 0x7b,
+  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72,
+  0x2e, 0x74, 0x79, 0x70, 0x65, 0x7d, 0x5d, 0x3a, 0x20, 0x24, 0x7b, 0x72,
+  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e,
+  0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28,
+  0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
+  0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c,
+  0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f,
+  0x72, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65,
+  0x72, 0x72, 0x6f, 0x72, 0x7d, 0x60, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
+  0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e,
+  0x6e, 0x61, 0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62,
+  0x6f, 0x72, 0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
+  0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c,
+  0x61, 0x6d, 0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22,
+  0x2c, 0x20, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b,
+  0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c,
+  0x6c, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
+  0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72,
+  0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
+  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
+  0x6e, 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61,
+  0x6c, 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65,
+  0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e,
+  0x74, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61,
+  0x74, 0x20, 0x79, 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75,
+  0x62, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f,
+  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65,
+  0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69,
+  0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
+  0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f,
+  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27,
+  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c,
+  0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
+  0x67, 0x65, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a,
+  0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61,
+  0x64, 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65,
+  0x6e, 0x65, 0x72, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65,
+  0x22, 0x2c, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d,
+  0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
+  0x74, 0x65, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74,
+  0x61, 0x69, 0x6c, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
+  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f,
+  0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73,
+  0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74,
+  0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72,
+  0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
+  0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69,
+  0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
+  0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65,
+  0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e,
+  0x65, 0x77, 0x20, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
+  0x65, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79,
+  0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
+  0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20,
+  0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f,
+  0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c,
+  0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68,
+  0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74,
+  0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b,
+  0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
+  0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65,
+  0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64,
+  0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74,
+  0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45,
+  0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67,
+  0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
+  0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61,
+  0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
+  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67,
+  0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
+  0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
+  0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74,
+  0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20,
+  0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28,
+  0x22, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
+  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b,
+  0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75,
+  0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65,
+  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
+  0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
+  0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74,
+  0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65,
+  0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73,
+  0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e,
+  0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65,
+  0x6e, 0x74, 0x28, 0x22, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22,
+  0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20,
+  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74,
+  0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
+  0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74,
+  0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20,
+  0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28,
+  0x22, 0x64, 0x6f, 0x6e, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65,
+  0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74,
+  0x65, 0x6e, 0x74, 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x7d, 0x29, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74,
+  0x75, 0x72, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
+  0x67, 0x65, 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43,
+  0x61, 0x6c, 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72,
+  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d,
+  0x69, 0x73, 0x65, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73,
+  0x6f, 0x6c, 0x76, 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65,
+  0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74,
+  0x65, 0x78, 0x74, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f,
+  0x65, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f,
+  0x72, 0x74, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67,
+  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70,
+  0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69,
+  0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74,
+  0x68, 0x65, 0x6e, 0x28, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74,
+  0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65,
+  0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x6f, 0x72, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
+  0x6e, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c,
+  0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28,
+  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e,
+  0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
+  0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74,
+  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
+  0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70,
+  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d,
+  0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66,
+  0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e,
+  0x65, 0x77, 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61,
+  0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
+  0x65, 0x2c, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d,
+  0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
+  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22,
+  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77,
+  0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63,
+  0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61,
+  0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67,
+  0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20,
+  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63,
+  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+  0x73, 0x6f, 0x6c, 0x76, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
+  0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61,
+  0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65,
+  0x63, 0x74, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d,
+  0x3b, 0x0a, 0x0a, 0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64,
+  0x65, 0x70, 0x72, 0x65, 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20,
+  0x2a, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d,
+  0x70, 0x6c, 0x65, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e,
+  0x63, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63,
+  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63,
+  0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69,
+  0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75,
+  0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28,
+  0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70,
+  0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b,
+  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20,
+  0x7d, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61,
+  0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b,
+  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f,
+  0x20, 0x47, 0x65, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64,
+  0x65, 0x6c, 0x20, 0x69, 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e,
+  0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65,
+  0x66, 0x75, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74,
+  0x69, 0x6e, 0x67, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74,
+  0x65, 0x78, 0x74, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61,
+  0x6e, 0x64, 0x20, 0x73, 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78,
+  0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c,
+  0x6c, 0x61, 0x6d, 0x61, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66,
+  0x6f, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x63,
+  0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20,
+  0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21,
+  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
+  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x61, 0x70, 0x69,
+  0x5f, 0x75, 0x72, 0x6c, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69,
+  0x67, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x20, 0x7c, 0x7c,
+  0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
+  0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x20, 0x3d, 0x20, 0x61,
+  0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x60,
+  0x24, 0x7b, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x7d, 0x2f, 0x70,
+  0x72, 0x6f, 0x70, 0x73, 0x60, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28,
+  0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28,
+  0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65,
+  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
+  0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e,
+  0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x67, 0x65, 0x6e, 0x65,
+  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
+  0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x72,
+  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
+  0x73, 0x3b, 0x0a, 0x7d, 0x0a
+};
+unsigned int completion_js_len = 5909;
--- a/examples/server/deps.sh
+++ b/examples/server/deps.sh
@@ -8,3 +8,13 @@ PUBLIC=$DIR/public
 echo "download js bundle files"
 curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
 echo >> $PUBLIC/index.js # add newline
+
+FILES=$(ls $PUBLIC)
+
+cd $PUBLIC
+for FILE in $FILES; do
+  echo "generate $FILE.hpp"
+
+  # use simple flag for old version of xxd
+  xxd -i $FILE > $DIR/$FILE.hpp
+done
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/index.js.hpp
+++ b/examples/server/index.js.hpp
--- a/examples/server/json-schema-to-grammar.mjs.hpp
+++ b/examples/server/json-schema-to-grammar.mjs.hpp
--- a/examples/server/public/favicon.ico
+++ b/examples/server/public/favicon.ico
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -51,6 +51,26 @@
      margin-bottom: 0.5em;
    }

+  button, input, textarea, .button, a.button, select {
+      color: #666;
+      border: 1px solid #ddd;
+      border-radius: 4px;
+      line-height: 1.5em;
+      padding: 0.25em 0.25em;
+      text-decoration: none;
+      font-size: 1.1rem;
+    }
+
+    button {
+      border: 1px solid #2a8aad;
+      background: #3584e4;
+      font-weight: normal;
+      color: #fff;
+    }
+    button:disabled {
+      background: #9cbce5;
+    }
+
    #write form {
      margin: 1em 0 0 0;
      display: flex;
@@ -567,7 +587,7 @@
        runCompletion();
      }
      return html`
-        <div>
+        <div class="right">
          <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
          <button onclick=${stop} disabled=${!generating.value}>Stop</button>
          <button onclick=${reset}>Reset</button>
@@ -881,11 +901,11 @@
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
-        .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
-        .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
-        .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
-        .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
-        .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
+        .replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
+        .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
+        .replace(/__(.*?)__/g, '<strong>$1</strong>')
+        .replace(/\*(.*?)\*/g, '<em>$1</em>')
+        .replace(/_(.*?)_/g, '<em>$1</em>')
        .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
        .replace(/`(.*?)`/g, '<code>$1</code>')
        .replace(/\n/gim, '<br />');
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -1,95 +1,33 @@
-// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
+// WARNING: This file was ported from json-schema-to-grammar.py, please fix bugs / add features there first.
 const SPACE_RULE = '" "?';

-function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
-  const separatorRule = opts.separatorRule ?? '';
-  const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false
-
-  if (separatorRule === '') {
-    if (minItems === 0 && maxItems === 1) {
-      return `${itemRule}?`;
-    } else if (minItems === 1 && maxItems === undefined) {
-      return `${itemRule}+`;
-    }
-  }
-
-  let result = '';
-  if (minItems > 0) {
-    if (itemRuleIsLiteral && separatorRule === '') {
-      result = `"${itemRule.slice(1, -1).repeat(minItems)}"`;
-    } else {
-      result = Array.from({ length: minItems }, () => itemRule)
-        .join(separatorRule !== '' ? ` ${separatorRule} ` : ' ');
-    }
-  }
-
-  const optRepetitions = (upToN, prefixWithSep=false) => {
-    const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule;
-    if (upToN === 0) {
-      return '';
-    } else if (upToN === 1) {
-      return `(${content})?`;
-    } else if (separatorRule !== '' && !prefixWithSep) {
-      return `(${content} ${optRepetitions(upToN - 1, true)})?`;
-    } else {
-      return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join('');
-    }
-  };
-
-  if (minItems > 0 && maxItems !== minItems) {
-    result += ' ';
-  }
-
-  if (maxItems !== undefined) {
-    result += optRepetitions(maxItems - minItems, minItems > 0);
-  } else {
-    const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`;
-
-    if (minItems === 0 && separatorRule !== '') {
-      result = `(${itemRule} ${itemOperator}*)?`;
-    } else {
-      result += `${itemOperator}*`;
-    }
-  }
-
-  return result;
-}
-
-class BuiltinRule {
-  constructor(content, deps) {
-    this.content = content;
-    this.deps = deps || [];
-  }
-}
-
-const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15);
-
 const PRIMITIVE_RULES = {
-  boolean        : new BuiltinRule('("true" | "false") space', []),
-  'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []),
-  'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []),
-  number         : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
-  integer        : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
-  value          : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
-  object         : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
-  array          : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
-  uuid           : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []),
-  char           : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []),
-  string         : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
-  null           : new BuiltinRule('"null" space', []),
+  boolean: '("true" | "false") space',
+  number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
+  integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
+  value: 'object | array | string | number | boolean',
+  object: '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
+  array: '"[" space ( value ("," space value)* )? "]" space',
+  uuid: '"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space',
+  string: ` "\\"" (
+        [^"\\\\] |
+        "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+      )* "\\"" space`,
+  null: '"null" space',
 };
+const OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value'];

 // TODO: support "uri", "email" string formats
-const STRING_FORMAT_RULES = {
-  'date'            : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
-  'time'            : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
-  'date-time'       : new BuiltinRule('date "T" time', ['date', 'time']),
-  'date-string'     : new BuiltinRule('"\\"" date "\\"" space', ['date']),
-  'time-string'     : new BuiltinRule('"\\"" time "\\"" space', ['time']),
-  'date-time-string': new BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
-}
+const DATE_RULES = {
+    'date'   : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
+    'time'   : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
+    'date-time': 'date "T" time',
+    'date-string': '"\\"" date "\\"" space',
+    'time-string': '"\\"" time "\\"" space',
+    'date-time-string': '"\\"" date-time "\\"" space',
+};

-const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...STRING_FORMAT_RULES};
+const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...DATE_RULES};

 const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
 const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
@@ -220,7 +158,7 @@ export class SchemaConverter {
        rule = '[\\U00000000-\\U0010FFFF]';
      } else {
        // Accept any character... except \n and \r line break chars (\x0A and \xOD)
-        rule = '[^\\x0A\\x0D]';
+        rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]';
      }
      return this._addRule('dot', rule);
    };
@@ -321,19 +259,26 @@ export class SchemaConverter {

          let [sub, subIsLiteral] = seq[seq.length - 1];

-          if (!subIsLiteral) {
-            let id = subRuleIds[sub];
-            if (id === undefined) {
-              id = this._addRule(`${name}-${Object.keys(subRuleIds).length + 1}`, sub);
-              subRuleIds[sub] = id;
+          if (minTimes === 0 && maxTimes === Infinity) {
+            seq[seq.length - 1] = [`${sub}*`, false];
+          } else if (minTimes === 0 && maxTimes === 1) {
+            seq[seq.length - 1] = [`${sub}?`, false];
+          } else if (minTimes === 1 && maxTimes === Infinity) {
+            seq[seq.length - 1] = [`${sub}+`, false];
+          } else {
+            if (!subIsLiteral) {
+              let id = subRuleIds[sub];
+              if (id === undefined) {
+                id = this._addRule(`${name}-${Object.keys(subRuleIds).length + 1}`, sub);
+                subRuleIds[sub] = id;
+              }
+              sub = id;
            }
-            sub = id;
-          }

-          seq[seq.length - 1] = [
-            _buildRepetition(subIsLiteral ? `"${sub}"` : sub, minTimes, maxTimes, {itemRuleIsLiteral: subIsLiteral}),
-            false
-          ];
+            const repeatedSub = Array.from({ length: minTimes }, () => subIsLiteral ? `"${sub.slice(1, -1).repeat(minTimes)}"` : sub);
+            const optionalSub = maxTimes !== undefined ? Array.from({ length: maxTimes - minTimes }, () => `${sub}?`) : [`${sub}*`];
+            seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false];
+          }
        } else {
          let literal = '';
          while (i < length) {
@@ -449,50 +394,49 @@ export class SchemaConverter {
        );
      } else {
        const itemRuleName = this.visit(items, `${name ?? ''}${name ? '-' : ''}item`);
-        const minItems = schema.minItems || 0;
+        const listItemOperator = `( "," space ${itemRuleName} )`;
+        let successiveItems = '';
+        let minItems = schema.minItems || 0;
        const maxItems = schema.maxItems;
-        return this._addRule(ruleName, '"[" space ' + _buildRepetition(itemRuleName, minItems, maxItems, {separatorRule: '"," space'}) + ' "]" space');
+        if (minItems > 0) {
+          successiveItems = listItemOperator.repeat(minItems - 1);
+          minItems--;
+        }
+        if (maxItems !== undefined && maxItems > minItems) {
+          successiveItems += `${listItemOperator}?`.repeat(maxItems - minItems - 1);
+        } else {
+          successiveItems += `${listItemOperator}*`;
+        }
+        const rule = minItems === 0
+          ? `"[" space ( ${itemRuleName} ${successiveItems} )? "]" space`
+          : `"[" space ${itemRuleName} ${successiveItems} "]" space`;
+        return this._addRule(ruleName, rule);
      }
    } else if ((schemaType === undefined || schemaType === 'string') && 'pattern' in schema) {
      return this._visitPattern(schema.pattern, ruleName);
    } else if ((schemaType === undefined || schemaType === 'string') && /^uuid[1-5]?$/.test(schema.format || '')) {
-      return this._addPrimitive(
-        ruleName === 'root' ? 'root' : schemaFormat,
-        PRIMITIVE_RULES['uuid']
-      );
-    } else if ((schemaType === undefined || schemaType === 'string') && `${schema.format}-string` in STRING_FORMAT_RULES) {
-      const primName = `${schema.format}-string`
-      return this._addRule(ruleName, this._addPrimitive(primName, STRING_FORMAT_RULES[primName]));
-    } else if (schemaType === 'string' && ('minLength' in schema || 'maxLength' in schema)) {
-      const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
-      const minLen = schema.minLength || 0;
-      const maxLen = schema.maxLength;
-      return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
+      return this._addRule(
+          ruleName === 'root' ? 'root' : schemaFormat,
+          PRIMITIVE_RULES['uuid'])
+    } else if ((schemaType === undefined || schemaType === 'string') && schema.format in DATE_RULES) {
+      for (const [t, r] of Object.entries(DATE_RULES)) {
+        this._addRule(t, r);
+      }
+      return schemaFormat + '-string';
    } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
-      return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
+      for (const n of OBJECT_RULE_NAMES) {
+        this._addRule(n, PRIMITIVE_RULES[n]);
+      }
+      return this._addRule(ruleName, 'object');
    } else {
      if (!(schemaType in PRIMITIVE_RULES)) {
        throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
      }
      // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
-      return this._addPrimitive(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
+      return this._addRule(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
    }
  }

-  _addPrimitive(name, rule) {
-    let n = this._addRule(name, rule.content);
-    for (const dep of rule.deps) {
-      const depRule = PRIMITIVE_RULES[dep] || STRING_FORMAT_RULES[dep];
-      if (!depRule) {
-        throw new Error(`Rule ${dep} not known`);
-      }
-      if (!(dep in this._rules)) {
-        this._addPrimitive(dep, depRule);
-      }
-    }
-    return n;
-  }
-
  _buildObjectRule(properties, required, name, additionalProperties) {
    const propOrder = this._propOrder;
    // sort by position in prop_order (if specified) then by original order
@@ -518,7 +462,7 @@ export class SchemaConverter {
      const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
      propKvRuleNames['*'] = this._addRule(
        `${subName}-kv`,
-        `${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
+        `${this._addRule('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
      optionalProps.push('*');
    }

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Aidan	124e4dced2	Update	2024-04-22 10:42:32 +01:00
Aidan	907df4459c	Hack test-bench	2024-04-12 11:26:36 +01:00