chore: fix typo in llama.cpp (#7032)

Co-authored-by: Jared Van Bortel <jared@nomic.ai>
Update LOG_IMPL and LOG_TEE_IMPL (#7029)
2026-02-12 14:03:20 +02:00 · 2024-05-02 11:56:41 -04:00 · 2024-05-01 23:31:30 +02:00 · 2024-05-01 22:27:41 +03:00 · 2024-05-01 17:52:55 +02:00 · 2024-05-01 14:46:37 +02:00
212 changed files with 19206 additions and 12783 deletions
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -28,6 +28,8 @@ COPY . .
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1
+# Enable cURL
+ENV LLAMA_CURL=1

 RUN make

--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -40,6 +40,11 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

+# Enable cURL
+ENV LLAMA_CURL=1
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 RUN make

 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev

 COPY requirements.txt   requirements.txt
 COPY requirements       requirements
@@ -15,6 +15,9 @@ WORKDIR /app

 COPY . .

+ENV LLAMA_CURL=1
+
+
 RUN make

 ENV LC_ALL=C.utf8
--- a/.devops/main-intel.Dockerfile
+++ b/.devops/main-intel.Dockerfile
@@ -10,14 +10,12 @@ WORKDIR /app

 COPY . .

-RUN mkdir build && \
-    cd build && \
-    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
        echo "LLAMA_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
    fi && \
-    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
-    cmake --build . --config Release --target main
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target main

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

--- a/.devops/main-vulkan.Dockerfile
+++ b/.devops/main-vulkan.Dockerfile
@@ -14,10 +14,8 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN mkdir build && \
-    cd build && \
-    cmake .. -DLLAMA_VULKAN=1 && \
-    cmake --build . --config Release --target main
+RUN cmake -B build -DLLAMA_VULKAN=1 && \
+    cmake --build build --config Release --target main

 # Clean up
 WORKDIR /
--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all

 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git libcurl4-openssl-dev

 WORKDIR /app

@@ -22,11 +22,16 @@ COPY . .
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1
+# Enable cURL
+ENV LLAMA_CURL=1

 RUN make

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 COPY --from=build /app/server /server

 ENTRYPOINT [ "/server" ]
--- a/.devops/server-intel.Dockerfile
+++ b/.devops/server-intel.Dockerfile
@@ -4,23 +4,24 @@ FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
-    apt-get install -y git
+    apt-get install -y git libcurl4-openssl-dev

 WORKDIR /app

 COPY . .

-RUN mkdir build && \
-    cd build && \
-    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
        echo "LLAMA_SYCL_F16 is set" && \
        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
    fi && \
-    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
-    cmake --build . --config Release --target server
+    cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target server

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 COPY --from=build /app/build/bin/server /server

 ENV LC_ALL=C.utf8
--- a/.devops/server-rocm.Dockerfile
+++ b/.devops/server-rocm.Dockerfile
@@ -40,6 +40,11 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

+# Enable cURL
+ENV LLAMA_CURL=1
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 RUN make

 ENTRYPOINT [ "/app/server" ]
--- a/.devops/server-vulkan.Dockerfile
+++ b/.devops/server-vulkan.Dockerfile
@@ -11,13 +11,15 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
    apt update -y && \
    apt-get install -y vulkan-sdk

+# Install cURL
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 # Build it
 WORKDIR /app
 COPY . .
-RUN mkdir build && \
-    cd build && \
-    cmake .. -DLLAMA_VULKAN=1 && \
-    cmake --build . --config Release --target server
+RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
+    cmake --build build --config Release --target server

 # Clean up
 WORKDIR /
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -32,7 +32,7 @@ on:
    -  cron: '04 2 * * *'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

 jobs:
@@ -79,20 +79,24 @@ jobs:
            sleep 0.1
          done

-      - name: Install k6
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+
+      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd examples/server/bench
-          wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
-          tar xzf k6*.tar.gz --strip-components=1
+          go install go.k6.io/xk6/cmd/xk6@latest
+          xk6 build master \
+              --with github.com/phymbert/xk6-sse

      - name: Build
        id: cmake_build
        run: |
          set -eux
-          mkdir build
-          cd build
-          cmake .. \
+          cmake -B build \
              -DLLAMA_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
@@ -103,7 +107,7 @@ jobs:
              -DLLAMA_FATAL_WARNINGS=OFF \
              -DLLAMA_ALL_WARNINGS=OFF \
              -DCMAKE_BUILD_TYPE=Release;
-          cmake --build . --config Release -j $(nproc) --target server
+          cmake --build build --config Release -j $(nproc) --target server

      - name: Download the dataset
        id: download_dataset
@@ -118,7 +122,7 @@ jobs:

          cd examples/server/bench
          source venv/bin/activate
-          BENCH_K6_BIN_PATH=./k6 python bench.py \
+          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
              --branch ${{ github.head_ref || github.ref_name }} \
@@ -228,9 +232,9 @@ jobs:
            <summary>Expand details for performance related PR only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
-            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
+            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}


--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -32,6 +32,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - name: Dependencies
        id: depends
@@ -52,7 +54,7 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L 'main|curl' --verbose --timeout 900

      - name: Determine tag name
        id: tag
@@ -88,6 +90,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - name: Dependencies
        id: depends
@@ -101,7 +105,9 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
+          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -204,26 +210,28 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
          cmake --build . --config Release -j $(nproc)

      - name: Test
        id: cmake_test
        run: |
          cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L 'main|curl' --verbose --timeout 900

      - name: Test llama2c conversion
        id: llama2c_test
@@ -236,6 +244,33 @@ jobs:
          ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
          ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp LICENSE ./build/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
+          name: llama-bin-ubuntu-x64.zip
+
 #  ubuntu-latest-cmake-sanitizer:
 #    runs-on: ubuntu-latest
 #
@@ -558,6 +593,63 @@ jobs:
        run: |
            make swift

+  windows-msys2:
+    runs-on: windows-latest
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - { sys: UCRT64,  env: ucrt-x86_64,  build: Release }
+          - { sys: CLANG64, env: clang-x86_64, build: Release }
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+
+      - name: Setup ${{ matrix.sys }}
+        uses: msys2/setup-msys2@v2
+        with:
+          update: true
+          msystem: ${{matrix.sys}}
+          install: >-
+            base-devel
+            mingw-w64-${{matrix.env}}-toolchain
+            mingw-w64-${{matrix.env}}-cmake
+            mingw-w64-${{matrix.env}}-openblas
+
+      - name: Build using make
+        shell: msys2 {0}
+        run: |
+            make -j $(nproc)
+
+      - name: Clean after building using make
+        shell: msys2 {0}
+        run: |
+            make clean
+
+      - name: Build using make w/ OpenBLAS
+        shell: msys2 {0}
+        run: |
+            make LLAMA_OPENBLAS=1 -j $(nproc)
+
+      - name: Build using CMake
+        shell: msys2 {0}
+        run: |
+            cmake -B build
+            cmake --build build --config ${{ matrix.build }} -j $(nproc)
+
+      - name: Clean after building using CMake
+        shell: msys2 {0}
+        run: |
+            rm -rf build
+
+      - name: Build using CMake w/ OpenBLAS
+        shell: msys2 {0}
+        run: |
+            cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+            cmake --build build --config ${{ matrix.build }} -j $(nproc)
+
  windows-latest-cmake:
    runs-on: windows-latest

@@ -938,6 +1030,12 @@ jobs:
      - name: Download artifacts
        id: download-artifact
        uses: actions/download-artifact@v4
+        with:
+          path: ./artifact
+
+      - name: Move artifacts
+        id: move_artifacts
+        run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release

      - name: Create release
        id: create_release
@@ -956,7 +1054,7 @@ jobs:
            const path = require('path');
            const fs = require('fs');
            const release_id = '${{ steps.create_release.outputs.id }}';
-            for (let file of await fs.readdirSync('./artifact')) {
+            for (let file of await fs.readdirSync('./artifact/release')) {
              if (path.extname(file) === '.zip') {
                console.log('uploadReleaseAsset', file);
                await github.repos.uploadReleaseAsset({
@@ -964,7 +1062,7 @@ jobs:
                  repo: context.repo.repo,
                  release_id: release_id,
                  name: file,
-                  data: await fs.readFileSync(`./artifact/${file}`)
+                  data: await fs.readFileSync(`./artifact/release/${file}`)
                });
              }
            }
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -12,7 +12,7 @@ jobs:
    steps:
      - uses: actions/stale@v5
        with:
-          exempt-issue-labels: "refactor,help wanted,good first issue,research"
+          exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
          days-before-issue-stale: 30
          days-before-issue-close: 14
          stale-issue-label: "stale"
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -91,6 +91,12 @@ jobs:
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi

+      - name: Downcase github.repository_owner
+        run: |
+          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
+        env:
+          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
@@ -98,7 +104,7 @@ jobs:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
@@ -107,5 +113,5 @@ jobs:
          context: .
          push: ${{ github.event_name == 'push' }}
          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
          file: ${{ matrix.config.dockerfile }}
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -21,4 +21,4 @@ jobs:
        uses: py-actions/flake8@v2
        with:
            ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
-            exclude: "examples/*,examples/*/**,*/**/__init__.py"
+            exclude: "examples/*,examples/*/**,*/**/__init__.py,convert-hf-to-gguf-update.py"
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -23,7 +23,7 @@ on:
    -  cron: '2 4 * * *'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

 jobs:
@@ -41,23 +41,16 @@ jobs:
            sanitizer: ""
      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken

-    container:
-      image: ubuntu:latest
-      ports:
-        - 8888
-      options: --cpus 4
-
    steps:
      - name: Dependencies
        id: depends
        run: |
-          apt-get update
-          apt-get -y install \
+          sudo apt-get update
+          sudo apt-get -y install \
            build-essential \
            xxd \
            git \
            cmake \
-            python3-pip \
            curl \
            wget \
            language-pack-en \
@@ -70,6 +63,17 @@ jobs:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

+      - name: Python setup
+        id: setup_python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r examples/server/tests/requirements.txt
+
      - name: Verify server deps
        id: verify_server_deps
        run: |
@@ -90,20 +94,14 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          mkdir build
-          cd build
-          cmake .. \
+          cmake -B build \
              -DLLAMA_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server

-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r examples/server/tests/requirements.txt

      - name: Tests
        id: server_integration_tests
@@ -129,6 +127,7 @@ jobs:
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: libCURL
        id: get_libcurl
@@ -142,10 +141,8 @@ jobs:
      - name: Build
        id: cmake_build
        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
+          cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
+          cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server

      - name: Python setup
        id: setup_python
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 *.a
 *.so
 *.gguf
+*.gguf.json
 *.bin
 *.exe
 *.dll
@@ -34,6 +35,7 @@ lcov-report/
 gcovr-report/

 build*
+!build.zig
 cmake-build-*
 out/
 tmp/
@@ -48,6 +50,7 @@ models-mnt
 /convert-llama2c-to-ggml
 /embd-input-test
 /embedding
+/eval-callback
 /gguf
 /gguf-llama-simple
 /gguf-split
@@ -99,7 +102,25 @@ qnt-*.txt
 perf-*.txt

 examples/jeopardy/results.txt
+examples/server/*.html.hpp
+examples/server/*.js.hpp
+examples/server/*.mjs.hpp

 poetry.lock
 poetry.toml
 nppBackup
+
+# Test binaries
+/tests/test-grammar-parser
+/tests/test-llama-grammar
+/tests/test-double-float
+/tests/test-grad0
+/tests/test-opt
+/tests/test-quantize-fns
+/tests/test-quantize-perf
+/tests/test-sampling
+/tests/test-tokenizer-0
+/tests/test-tokenizer-1-spm
+/tests/test-tokenizer-1-bpe
+/tests/test-rope
+/tests/test-backend-ops
--- a/655
+++ b/655
@@ -0,0 +1,655 @@
+# date: Tue Apr  9 09:17:14 EEST 2024
+# this file is auto-generated by scripts/gen-authors.sh
+
+0cc4m <picard12@live.de>
+0xspringtime <110655352+0xspringtime@users.noreply.github.com>
+2f38b454 <dxf@protonmail.com>
+3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
+44670 <44670@users.noreply.github.com>
+AN Long <aisk@users.noreply.github.com>
+AT <manyoso@users.noreply.github.com>
+Aarni Koskela <akx@iki.fi>
+Aaron Miller <apage43@ninjawhale.com>
+Aaryaman Vasishta <aaryaman.vasishta@amd.com>
+Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
+Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
+Adithya Balaji <adithya.b94@gmail.com>
+AdithyanI <adithyan.i4internet@gmail.com>
+Adrian <smith.adriane@gmail.com>
+Adrian Hesketh <a-h@users.noreply.github.com>
+AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
+Aisuko <urakiny@gmail.com>
+Alberto <57916483+albbus-stack@users.noreply.github.com>
+Alex <awhill19@icloud.com>
+Alex Azarov <alex@azarov.by>
+Alex Azarov <alexander.azarov@mapbox.com>
+Alex Klinkhamer <from.github.com.917@grencez.dev>
+Alex Klinkhamer <git@grencez.dev>
+Alex Nguyen <tiendung@users.noreply.github.com>
+Alex Petenchea <alex.petenchea@gmail.com>
+Alex Renda <alexrenda@users.noreply.github.com>
+Alex von Gluck IV <kallisti5@unixzen.com>
+Alexey Parfenov <zxed@alkatrazstudio.net>
+Ali Chraghi <63465728+alichraghi@users.noreply.github.com>
+Ali Nehzat <ali.nehzat@thanks.dev>
+Ali Tariq <ali.tariq@10xengineers.ai>
+Alon <alonfaraj@gmail.com>
+AlpinDale <52078762+AlpinDale@users.noreply.github.com>
+AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
+Ananta Bastola <anantarajbastola@gmail.com>
+Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
+András Salamon <ott2@users.noreply.github.com>
+Andrei <abetlen@gmail.com>
+Andrew Canis <andrew.canis@gmail.com>
+Andrew Duffy <a10y@users.noreply.github.com>
+Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
+Arik Poznanski <arikpoz@users.noreply.github.com>
+Artem <guinmoon@gmail.com>
+Artyom Lebedev <vagran.ast@gmail.com>
+Asbjørn Olling <asbjornolling@gmail.com>
+Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
+Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
+Ashraful Islam <ashraful.meche@gmail.com>
+Atsushi Tatsuma <yoshoku@outlook.com>
+Austin <77757836+teleprint-me@users.noreply.github.com>
+AustinMroz <austinmroz@utexas.edu>
+BADR <contact@pythops.com>
+Bach Le <bach@bullno1.com>
+Bailey Chittle <39804642+bachittle@users.noreply.github.com>
+BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
+Behnam M <58621210+ibehnam@users.noreply.github.com>
+Ben Garney <bengarney@users.noreply.github.com>
+Ben Siraphob <bensiraphob@gmail.com>
+Ben Williams <ben@719ben.com>
+Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
+Bernat Vadell <hounter.caza@gmail.com>
+Bodo Graumann <mail@bodograumann.de>
+Bono Lv <lvscar@users.noreply.github.com>
+Borislav Stanimirov <b.stanimirov@abv.bg>
+Branden Butler <bwtbutler@hotmail.com>
+Brian <mofosyne@gmail.com>
+Bruce MacDonald <brucewmacdonald@gmail.com>
+CJ Pais <cj@cjpais.com>
+CRD716 <crd716@gmail.com>
+Cameron <csteele@steelecameron.com>
+Cameron Kaiser <classilla@users.noreply.github.com>
+Casey Primozic <casey@cprimozic.net>
+Casey Primozic <me@ameo.link>
+CausalLM <148736309+CausalLM@users.noreply.github.com>
+Cebtenzzre <cebtenzzre@gmail.com>
+Chad Brewbaker <crb002@gmail.com>
+Cheng Shao <terrorjack@type.dance>
+Chris Kuehl <ckuehl@ckuehl.me>
+Christian Demsar <christian@github.email.demsar.us>
+Christian Demsar <crasm@git.vczf.us>
+Christian Falch <875252+chrfalch@users.noreply.github.com>
+Christian Kögler <ck3d@gmx.de>
+Clark Saben <76020733+csaben@users.noreply.github.com>
+Clint Herron <hanclinto@gmail.com>
+Cuong Trinh Manh <nguoithichkhampha@gmail.com>
+DAN™ <dranger003@gmail.com>
+Damian Stewart <d@damianstewart.com>
+Dane Madsen <dane_madsen@hotmail.com>
+DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
+Daniel Bevenius <daniel.bevenius@gmail.com>
+Daniel Drake <drake@endlessos.org>
+Daniel Hiltgen <dhiltgen@users.noreply.github.com>
+Daniel Illescas Romero <illescas.daniel@protonmail.com>
+DannyDaemonic <DannyDaemonic@gmail.com>
+Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
+Dave Della Costa <ddellacosta+github@gmail.com>
+David Friehs <david@friehs.info>
+David Kennedy <dakennedyd@gmail.com>
+David Pflug <david@pflug.email>
+David Renshaw <dwrenshaw@gmail.com>
+David Sommers <12738+databyte@users.noreply.github.com>
+David Yang <davidyang6us@gmail.com>
+Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
+Dean <Dean.Sinaean@gmail.com>
+Deins <deinsegle@gmail.com>
+Didzis Gosko <didzis@users.noreply.github.com>
+Don Mahurin <dmahurin@users.noreply.github.com>
+DooWoong Lee (David) <manics99@naver.com>
+Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
+Douglas Hanley <thesecretaryofwar@gmail.com>
+Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
+Ebey Abraham <ebey97@gmail.com>
+Ed Lee <edilee@mozilla.com>
+Ed Lepedus <ed.lepedus@googlemail.com>
+Edward Taylor <edeetee@gmail.com>
+Elbios <141279586+Elbios@users.noreply.github.com>
+Engininja2 <139037756+Engininja2@users.noreply.github.com>
+Equim <sayaka@ekyu.moe>
+Eric Sommerlade <es0m@users.noreply.github.com>
+Eric Zhang <34133756+EZForever@users.noreply.github.com>
+Erik Garrison <erik.garrison@gmail.com>
+Erik Scholz <Green-Sky@users.noreply.github.com>
+Ettore Di Giacinto <mudler@users.noreply.github.com>
+Evan Jones <evan.q.jones@gmail.com>
+Evan Miller <emmiller@gmail.com>
+Eve <139727413+netrunnereve@users.noreply.github.com>
+Evgeny Kurnevsky <kurnevsky@gmail.com>
+Ewout ter Hoeven <E.M.terHoeven@student.tudelft.nl>
+ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com>
+FK <sozforex@gmail.com>
+Fabian <cmdrf@users.noreply.github.com>
+Fabio R. Sluzala <Fabio3rs@users.noreply.github.com>
+Faez Shakil <faez.shakil@gmail.com>
+FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
+Fattire <528174+fat-tire@users.noreply.github.com>
+Felix <stenbackfelix@gmail.com>
+Finn Voorhees <finnvoorhees@gmail.com>
+Firat <firatkiral@gmail.com>
+Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
+Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
+Francisco Melo <43780565+francis2tm@users.noreply.github.com>
+FrankHB <frankhb1989@gmail.com>
+Frederik Vogel <Schaltfehler@users.noreply.github.com>
+Gabe Goodhart <gabe.l.hart@gmail.com>
+GainLee <perfecter.gen@gmail.com>
+Galunid <karolek1231456@gmail.com>
+Gary Linscott <glinscott@gmail.com>
+Gary Mulder <gjmulder@gmail.com>
+Genkagaku.GPT <hlhr202@163.com>
+Georgi Gerganov <ggerganov@gmail.com>
+Gilad S <giladgd@users.noreply.github.com>
+GiviMAD <GiviMAD@users.noreply.github.com>
+Govlzkoy <gotope@users.noreply.github.com>
+Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
+Guillaume Wenzek <gwenzek@users.noreply.github.com>
+Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
+Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
+Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
+Haohui Mai <ricetons@gmail.com>
+Haoxiang Fei <tonyfettes@tonyfettes.com>
+Harald Fernengel <harald.fernengel@here.com>
+Hatsune Miku <129688334+at8u@users.noreply.github.com>
+Henk Poley <HenkPoley@gmail.com>
+Henri Vasserman <henv@hot.ee>
+Henrik Forstén <henrik.forsten@gmail.com>
+Herman Semenov <GermanAizek@yandex.ru>
+Hesen Peng <hesen.peng@gmail.com>
+Hoang Nguyen <hugo53@users.noreply.github.com>
+Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
+Howard Su <howard0su@gmail.com>
+Hua Jiang <allenhjiang@outlook.com>
+Huawei Lin <huaweilin.cs@gmail.com>
+Ian Bull <irbull@eclipsesource.com>
+Ian Bull <irbull@gmail.com>
+Ian Scrivener <github@zilogy.asia>
+Ido S <ido.pluto@gmail.com>
+IgnacioFDM <ignaciofdm@gmail.com>
+Igor Okulist <okigan@gmail.com>
+Ikko Eltociear Ashimine <eltociear@gmail.com>
+Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com>
+Ionoclast Laboratories <brigham@ionoclast.com>
+Isaac McFadyen <isaac@imcf.me>
+IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com>
+Ivan Komarov <Ivan.Komarov@dfyz.info>
+Ivan Stepanov <ivanstepanovftw@gmail.com>
+JH23X <165871467+JH23X@users.noreply.github.com>
+Jack Mousseau <jmousseau@users.noreply.github.com>
+JackJollimore <130917767+JackJollimore@users.noreply.github.com>
+Jag Chadha <jagtesh@gmail.com>
+Jakub N <jakubniemczyk97@gmail.com>
+James Reynolds <magnusviri@users.noreply.github.com>
+Jan Boon <jan.boon@kaetemi.be>
+Jan Boon <kaetemi@gmail.com>
+Jan Ploski <jpl@plosquare.com>
+Jannis Schönleber <joennlae@gmail.com>
+Jared Van Bortel <cebtenzzre@gmail.com>
+Jared Van Bortel <jared@nomic.ai>
+Jason McCartney <jmac@theroot.org>
+Jean-Christophe Hoelt <hoelt@fovea.cc>
+Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
+Jed Fox <git@jedfox.com>
+Jeffrey Quesnelle <emozilla@nousresearch.com>
+Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
+Jhen-Jie Hong <iainst0409@gmail.com>
+Jiahao Li <liplus17@163.com>
+Jian Liao <jianliao@users.noreply.github.com>
+JidongZhang-THU <1119708529@qq.com>
+Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
+Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
+Johannes Gäßler <johannesg@5d6.de>
+Johannes Rudolph <johannes.rudolph@gmail.com>
+John <78893154+cmp-nct@users.noreply.github.com>
+John Balis <phobossystems@gmail.com>
+John Smith <67539080+kingsidelee@users.noreply.github.com>
+JohnnyB <jboero@users.noreply.github.com>
+Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
+Jorge A <161275481+jorgealias@users.noreply.github.com>
+Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
+Joseph Stahl <1269177+josephst@users.noreply.github.com>
+Joyce <joycebrum@google.com>
+Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
+Judd <foldl@users.noreply.github.com>
+Julius Arkenberg <arki05@users.noreply.github.com>
+Jun Jie <71215065+junnjiee16@users.noreply.github.com>
+Juraj Bednar <juraj@bednar.io>
+Justin Parker <jparkerweb@gmail.com>
+Justin Suess <justin.suess@westpoint.edu>
+Justine Tunney <jtunney@gmail.com>
+Juuso Alasuutari <juuso.alasuutari@gmail.com>
+KASR <karim.asrih@gmail.com>
+Kamil Tomšík <info@tomsik.cz>
+Karsten Weiss <knweiss@gmail.com>
+Karthick <j.karthic2004@gmail.com>
+Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com>
+Karthik Sethuraman <k.seth1993@gmail.com>
+Kasumi <90275229+kasumi-1@users.noreply.github.com>
+Kawrakow <48489457+ikawrakow@users.noreply.github.com>
+Keiichi Tabata <keiichi.tabata@outlook.com>
+Kenvix ⭐ <kenvixzure@live.com>
+Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
+Kevin Ji <1146876+kevinji@users.noreply.github.com>
+Kevin Kwok <antimatter15@gmail.com>
+Kevin Lo <kevlo@kevlo.org>
+Kolen Cheung <ickc@users.noreply.github.com>
+Konstantin Herud <konstantin.herud@denkbares.com>
+Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
+Kunshang Ji <kunshang.ji@intel.com>
+Kyle Liang <liangmanlai@gmail.com>
+Kyle Mistele <kyle@mistele.com>
+Kylin <56434533+KyL0N@users.noreply.github.com>
+Lars Grammel <lars.grammel@gmail.com>
+Laura <Tijntje_7@msn.com>
+Lee <44310445+lx200916@users.noreply.github.com>
+Lee Drake <b.lee.drake@gmail.com>
+Leng Yue <lengyue@lengyue.me>
+LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
+Leonardo Neumann <leonardo@neumann.dev.br>
+Li Tan <tanliboy@gmail.com>
+Linwei Wang <wanix1988@gmail.com>
+LoganDark <github@logandark.mozmail.com>
+LostRuins <39025047+LostRuins@users.noreply.github.com>
+Luciano <lucianostrika44@gmail.com>
+Luo Tian <lt@basecity.com>
+M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
+Maarten ter Huurne <maarten@treewalker.org>
+Mack Straight <eiz@users.noreply.github.com>
+Maël Kerbiriou <m431.kerbiriou@gmail.com>
+MaggotHATE <clay1326@gmail.com>
+Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
+Marco Matthies <71844+marcom@users.noreply.github.com>
+Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
+Marian Cepok <marian.cepok@gmail.com>
+Mark Fairbairn <thebaron88@gmail.com>
+Marko Tasic <mtasic85@gmail.com>
+Martin Krasser <krasserm@googlemail.com>
+Martin Schwaighofer <mschwaig@users.noreply.github.com>
+Marvin Gießing <marvin.giessing@gmail.com>
+Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
+Matheus C. França <matheus-catarino@hotmail.com>
+Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
+Mathieu Nayrolles <MathieuNls@users.noreply.github.com>
+Mathijs de Bruin <mathijs@mathijsfietst.nl>
+Matt Clayton <156335168+mattjcly@users.noreply.github.com>
+Matt Pulver <matt.pulver@heavy.ai>
+Matteo Boschini <12133566+mbosc@users.noreply.github.com>
+Matthew Tejo <matthew.tejo@gmail.com>
+Matvey Soloviev <blackhole89@gmail.com>
+Maxime <672982+maximegmd@users.noreply.github.com>
+Maximilian Winter <maximilian.winter.91@gmail.com>
+Meng Zhang <meng@tabbyml.com>
+Meng, Hengyu <hengyu.meng@intel.com>
+Merrick Christensen <merrick.christensen@gmail.com>
+Michael Coppola <m18coppola@gmail.com>
+Michael Hueschen <m@mhueschen.dev>
+Michael Kesper <mkesper@schokokeks.org>
+Michael Klimenko <mklimenko29@gmail.com>
+Michael Podvitskiy <podvitskiymichael@gmail.com>
+Michael Potter <NanoTekGuy@Gmail.com>
+Michaël de Vries <vriesdemichael@gmail.com>
+Mihai <mihai.chirculescu@yahoo.com>
+Mike <ytianhui2004@gmail.com>
+Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
+Mirko185 <mirkosig@gmail.com>
+Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
+Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
+Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
+Murilo Santana <mvrilo@gmail.com>
+Musab Gultekin <musabgultekin@users.noreply.github.com>
+Nam D. Tran <42194884+namtranase@users.noreply.github.com>
+NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
+Nebula <infinitewormhole@gmail.com>
+Neo Zhang Jianyu <jianyu.zhang@intel.com>
+Neuman Vong <neuman.vong@gmail.com>
+Nexesenex <124105151+Nexesenex@users.noreply.github.com>
+Niall Coates <1349685+Niall-@users.noreply.github.com>
+Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
+Nigel Bosch <pnigelb@gmail.com>
+Niklas Korz <niklas@niklaskorz.de>
+Nindaleth <Nindaleth@users.noreply.github.com>
+Oleksandr Nikitin <oleksandr@tvori.info>
+Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
+Olivier Chafik <ochafik@users.noreply.github.com>
+Ondřej Čertík <ondrej@certik.us>
+Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
+Paul Tsochantaris <ptsochantaris@icloud.com>
+Pavol Rusnak <pavol@rusnak.io>
+Pedro Cuenca <pedro@huggingface.co>
+Peter Sugihara <peter@campsh.com>
+Phil H <5756783+phiharri@users.noreply.github.com>
+Philip Taron <philip.taron@gmail.com>
+Phillip Kravtsov <phillip@kravtsov.net>
+Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
+Pierrick Hymbert <pierrick.hymbert@gmail.com>
+Przemysław Pawełczyk <przemoc@gmail.com>
+Qin Yue Chen <71813199+chenqiny@users.noreply.github.com>
+Qingyou Meng <meng.qingyou@gmail.com>
+Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com>
+RJ Adriaansen <adriaansen@eshcc.eur.nl>
+Radoslav Gerganov <rgerganov@gmail.com>
+Radosław Gryta <radek.gryta@gmail.com>
+Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
+Rand Xie <randxiexyy29@gmail.com>
+Randall Fitzgerald <randall@dasaku.net>
+Reinforce-II <fate@eastal.com>
+Riceball LEE <snowyu.lee@gmail.com>
+Richard Kiss <him@richardkiss.com>
+Richard Roberson <richardr1126@gmail.com>
+Rick G <26732651+TheFlipbook@users.noreply.github.com>
+Rickard Edén <rickardeden@gmail.com>
+Rickard Hallerbäck <rickard.hallerback@gmail.com>
+Rickey Bowers Jr <bitRAKE@gmail.com>
+Riley Stewart <ristew@users.noreply.github.com>
+Rinne <AsakusaRinne@gmail.com>
+Rinne <liu_yaohui1998@126.com>
+Robert Brisita <986796+rbrisita@users.noreply.github.com>
+Robert Sung-wook Shin <edp1096@users.noreply.github.com>
+Robey Holderith <robey@flaminglunchbox.net>
+Robyn <robyngraf@users.noreply.github.com>
+Roger Meier <r.meier@siemens.com>
+Roland <14355895+rbur0425@users.noreply.github.com>
+Romain D <90720+Artefact2@users.noreply.github.com>
+Romain Neutron <romain@neutron.io>
+Roman Parykin <donderom@gmail.com>
+Ron Evans <ron@hybridgroup.com>
+Ron Jailall <rojailal@gmail.com>
+Ronny Brendel <ronnybrendel@gmail.com>
+Ronsor <ronsor@ronsor.pw>
+Rowan Hart <rowanbhart@gmail.com>
+Rune <43761327+Rune-AI@users.noreply.github.com>
+Ryan Landay <rlanday@gmail.com>
+Ryder Wishart <ryderwishart@gmail.com>
+Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
+SakuraUmi <yukinon244@gmail.com>
+Salvador E. Tropea <stropea@inti.gob.ar>
+Sam Spilsbury <smspillaz@gmail.com>
+Sami Farin <3876865+Safari77@users.noreply.github.com>
+Samuel Maynard <samwmaynard@gmail.com>
+Sang-Kil Park <sang.park@42dot.ai>
+Seb C <47074056+Sebby37@users.noreply.github.com>
+Sebastián A <sebastian.aedo29@gmail.com>
+SebastianApel <13675545+SebastianApel@users.noreply.github.com>
+Senemu <10880819+Senemu@users.noreply.github.com>
+Sergey Alirzaev <zl29ah@gmail.com>
+Sergio López <slp@sinrega.org>
+SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
+ShadovvBeast <ShadovvBeast@gmail.com>
+Shakhar Dasgupta <shakhardasgupta@gmail.com>
+Shangning Xu <32517059+xushangning@users.noreply.github.com>
+Shijie <821898965@qq.com>
+Shintarou Okada <kokuzen@gmail.com>
+Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
+Shouzheng Liu <lshzh.hi@gmail.com>
+Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
+Simon Willison <swillison@gmail.com>
+Siwen Yu <yusiwen@gmail.com>
+Sky Yan <skyan83@gmail.com>
+Slaren <2141330+slaren@users.noreply.github.com>
+Slava Primenko <primenko.s@gmail.com>
+SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com>
+Someone <sergei.kozlukov@aalto.fi>
+Someone Serge <sergei.kozlukov@aalto.fi>
+Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
+Spencer Sutton <spencersutton@users.noreply.github.com>
+Srinivas Billa <nivibilla@gmail.com>
+Stefan Sydow <stefan@sydow.email>
+Stephan Walter <stephan@walter.name>
+Stephen Nichols <snichols@users.noreply.github.com>
+Steve Grubb <ausearch.1@gmail.com>
+Steven Roussey <sroussey@gmail.com>
+Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
+Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
+SuperUserNameMan <yoann@terminajones.com>
+Tai Duc Nguyen <taiducnguyen.drexel@gmail.com>
+Taikono-Himazin <kazu@po.harenet.ne.jp>
+Tameem <113388789+AhmadTameem@users.noreply.github.com>
+Tamotsu Takahashi <ttakah+github@gmail.com>
+Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com>
+Thatcher Chamberlin <j.thatcher.c@gmail.com>
+Theia Vogel <theia@vgel.me>
+Thérence <13496987+Royalphax@users.noreply.github.com>
+Thibault Terrasson <thibault.terrasson@gmail.com>
+Thomas Klausner <wiz@gatalith.at>
+Tim Miller <drasticactions@users.noreply.github.com>
+Timmy Knight <r2d2fish@gmail.com>
+Timothy Cronin <40186632+4imothy@users.noreply.github.com>
+Ting Lou <ting.lou@gmail.com>
+Ting Sun <suntcrick@gmail.com>
+Tobias Lütke <tobi@shopify.com>
+Tom C <tom.corelis@gmail.com>
+Tom Jobbins <784313+TheBloke@users.noreply.github.com>
+Tomas <tom.tomas.36478119@gmail.com>
+Tomáš Pazdiora <tomas.pazdiora@gmail.com>
+Tristan Ross <rosscomputerguy@protonmail.com>
+Tungsten842 <886724vf@anonaddy.me>
+Tungsten842 <quantmint@protonmail.com>
+Tushar <ditsuke@protonmail.com>
+UEXTM.com <84163508+uextm@users.noreply.github.com>
+Uzo Nweke <uzoechi@gmail.com>
+Vaibhav Srivastav <vaibhavs10@gmail.com>
+Val Kharitonov <mail@kharvd.com>
+Valentin Konovalov <valle.ketsujin@gmail.com>
+Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
+Victor Z. Peng <ziliangdotme@gmail.com>
+Vlad <spitfireage@gmail.com>
+Vladimir <bogdad@gmail.com>
+Vladimir Malyutin <first-leon@yandex.ru>
+Vladimir Zorin <vladimir@deviant.guru>
+Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com>
+WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com>
+Weird Constructor <weirdconstructor@gmail.com>
+Welby Seely <welbyseely@gmail.com>
+Wentai Zhang <rchardx@gmail.com>
+WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
+Willy Tarreau <w@1wt.eu>
+Wu Jian Ping <wujjpp@hotmail.com>
+Wu Jian Ping <wujp@greatld.com>
+Xiake Sun <xiake.sun@intel.com>
+Xiang (Kevin) Li <kevinli020508@gmail.com>
+Xiao-Yong Jin <jinxiaoyong@gmail.com>
+XiaotaoChen <chenxiaotao1234@gmail.com>
+Xiaoyi Chen <cxychina@gmail.com>
+Xingchen Song(宋星辰) <xingchensong1996@163.com>
+Xuan Son Nguyen <thichthat@gmail.com>
+Yann Follet <131855179+YannFollet@users.noreply.github.com>
+Yiming Cui <conandiy@vip.qq.com>
+Yishuo Wang <MeouSker77@outlook.com>
+Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
+Yui <dev@sleepyyui.com>
+Yusuf Kağan Hanoğlu <hanoglu@yahoo.com>
+Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com>
+ZHAOKAI WANG <sanxianwei@163.com>
+Zane Shannon <z@zcs.me>
+Zay <95888118+isaiahbjork@users.noreply.github.com>
+Zenix <zenixls2@gmail.com>
+Zhang Peiyuan <a1286225768@gmail.com>
+ZhouYuChen <zhouyuchen@naver.com>
+Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
+Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
+Zsapi <martin1.zsapka@gmail.com>
+a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
+adel boussaken <netdur@gmail.com>
+afrideva <95653597+afrideva@users.noreply.github.com>
+akawrykow <142945436+akawrykow@users.noreply.github.com>
+alexpinel <93524949+alexpinel@users.noreply.github.com>
+alonfaraj <alonfaraj@gmail.com>
+andrijdavid <david@geek.mg>
+anon998 <131767832+anon998@users.noreply.github.com>
+anzz1 <anzz1@live.com>
+apaz <aarpazdera@gmail.com>
+apcameron <37645737+apcameron@users.noreply.github.com>
+arcrank <arcrank@gmail.com>
+arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
+at8u <129688334+at8u@users.noreply.github.com>
+automaticcat <daogiatuank54@gmail.com>
+bandoti <141645996+bandoti@users.noreply.github.com>
+beiller <beiller@gmail.com>
+bhubbb <79117352+bhubbb@users.noreply.github.com>
+bmwl <brian.marshall@tolko.com>
+bobqianic <129547291+bobqianic@users.noreply.github.com>
+bryanSwk <93190252+bryanSwk@users.noreply.github.com>
+bsilvereagle <bsilvereagle@users.noreply.github.com>
+bssrdf <merlintiger@hotmail.com>
+byte-6174 <88070277+byte-6174@users.noreply.github.com>
+cebtenzzre <cebtenzzre@gmail.com>
+chaihahaha <chai836275709@gmail.com>
+chiranko <96988916+chiranko@users.noreply.github.com>
+clibdev <52199778+clibdev@users.noreply.github.com>
+clyang <clyang@clyang.net>
+cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
+coezbek <c.oezbek@gmail.com>
+comex <comexk@gmail.com>
+compilade <113953597+compilade@users.noreply.github.com>
+crasm <crasm@git.vczf.net>
+crasm <crasm@git.vczf.us>
+daboe01 <daboe01@googlemail.com>
+david raistrick <keen99@users.noreply.github.com>
+ddpasa <112642920+ddpasa@users.noreply.github.com>
+deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
+divinity76 <divinity76@gmail.com>
+dotpy314 <33351922+dotpy314@users.noreply.github.com>
+drbh <david.richard.holtz@gmail.com>
+ds5t5 <145942675+ds5t5@users.noreply.github.com>
+dylan <canardleteer@users.noreply.github.com>
+eastriver <lee@eastriver.dev>
+ebraminio <ebraminio@gmail.com>
+eiery <19350831+eiery@users.noreply.github.com>
+eric8607242 <e0928021388@gmail.com>
+fraxy-v <65565042+fraxy-v@users.noreply.github.com>
+github-actions[bot] <github-actions[bot]@users.noreply.github.com>
+gliptic <gliptic@users.noreply.github.com>
+goerch <jhr.walter@t-online.de>
+grahameth <96447521+grahameth@users.noreply.github.com>
+gwjr <502526+gwjr@users.noreply.github.com>
+h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
+hankcs <cnhankmc@gmail.com>
+hoangmit <hoangmit@users.noreply.github.com>
+hongbo.mo <352280764@qq.com>
+howlger <eclipse@voormann.de>
+howlger <github@voormann.de>
+hutli <6594598+hutli@users.noreply.github.com>
+hutli <hutli@hutli.hu>
+hutli <jensstaermose@hotmail.com>
+hxer7963 <hxer7963@gmail.com>
+hydai <z54981220@gmail.com>
+iSma <ismail.senhaji@gmail.com>
+iacore <74560659+iacore@users.noreply.github.com>
+igarnier <igarnier@protonmail.com>
+iohub <rickyang.pro@gmail.com>
+jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
+jameswu2014 <545426914@qq.com>
+jneem <joeneeman@gmail.com>
+johnson442 <56517414+johnson442@users.noreply.github.com>
+jon-chuang <9093549+jon-chuang@users.noreply.github.com>
+jp-x-g <jpxg-dev@protonmail.com>
+jwj7140 <32943891+jwj7140@users.noreply.github.com>
+kaizau <kaizau@users.noreply.github.com>
+kalomaze <66376113+kalomaze@users.noreply.github.com>
+kang <tpdns9032100@gmail.com>
+katsu560 <118887472+katsu560@users.noreply.github.com>
+kchro3 <62481661+kchro3@users.noreply.github.com>
+khimaros <me@khimaros.com>
+kiltyj <kiltyj@gmail.com>
+klosax <131523366+klosax@users.noreply.github.com>
+kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
+kunnis <kunnis@users.noreply.github.com>
+kuronekosaiko <EvanChanJ@163.com>
+kuvaus <22169537+kuvaus@users.noreply.github.com>
+kwin1412 <42286931+kwin1412@users.noreply.github.com>
+l3utterfly <gc.pthzfoldr@gmail.com>
+ldwang <ftgreat@163.com>
+le.chang <cljs118@126.com>
+leejet <leejet714@gmail.com>
+limitedAtonement <limitedAtonement@users.noreply.github.com>
+lon <114724657+longregen@users.noreply.github.com>
+m3ndax <adrian.goessl@outlook.com>
+maddes8cht <55592906+maddes8cht@users.noreply.github.com>
+makomk <makosoft@googlemail.com>
+manikbhandari <mbbhandarimanik2@gmail.com>
+mdrokz <mohammadmunshi@gmail.com>
+mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
+minarchist <minarchist@users.noreply.github.com>
+mj-shifu <77107165+mj-shifu@users.noreply.github.com>
+mmyjona <jonathan.gonse@gmail.com>
+momonga <115213907+mmnga@users.noreply.github.com>
+moritzbrantner <31051084+moritzbrantner@users.noreply.github.com>
+mzcu <milos.cubrilo@gmail.com>
+nanahi <130121847+na-na-hi@users.noreply.github.com>
+ngc92 <7938269+ngc92@users.noreply.github.com>
+nhamanasu <45545786+nhamanasu@users.noreply.github.com>
+niansa/tuxifan <anton-sa@web.de>
+niansa/tuxifan <tuxifan@posteo.de>
+ningshanwutuobang <ningshanwutuobang@gmail.com>
+nold <Nold360@users.noreply.github.com>
+nopperl <54780682+nopperl@users.noreply.github.com>
+nusu-github <29514220+nusu-github@users.noreply.github.com>
+olexiyb <olexiyb@gmail.com>
+oobabooga <112222186+oobabooga@users.noreply.github.com>
+opparco <parco.opaai@gmail.com>
+ostix360 <55257054+ostix360@users.noreply.github.com>
+perserk <perserk@gmail.com>
+postmasters <namnguyen@google.com>
+pudepiedj <pudepiedj@gmail.com>
+qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
+qouoq <qouoq@fastmail.com>
+qunash <anzoria@gmail.com>
+rabidcopy <rabidcopy@yahoo.com>
+rankaiyx <rankaiyx@rankaiyx.com>
+rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com>
+rhuddleston <ryan.huddleston@percona.com>
+rimoliga <53384203+rimoliga@users.noreply.github.com>
+runfuture <runfuture@users.noreply.github.com>
+sandyiscool <sandyiscool@gmail.com>
+semidark <me@semidark.net>
+sharpHL <132747147+sharpHL@users.noreply.github.com>
+shibe2 <shibe@tuta.io>
+singularity <12184989+singularity-s0@users.noreply.github.com>
+sjinzh <sjinzh@gmail.com>
+slaren <2141330+slaren@users.noreply.github.com>
+slaren <slarengh@gmail.com>
+snadampal <87143774+snadampal@users.noreply.github.com>
+staviq <staviq@gmail.com>
+stduhpf <stephduh@live.fr>
+swittk <switt1995@gmail.com>
+takov751 <40316768+takov751@users.noreply.github.com>
+tarcey <cey.tarik@gmail.com>
+texmex76 <40733439+texmex76@users.noreply.github.com>
+thement <40525767+thement@users.noreply.github.com>
+tjohnman <tjohnman@users.noreply.github.com>
+tslmy <tslmy@users.noreply.github.com>
+ubik2 <ubik2@users.noreply.github.com>
+uint256_t <konndennsa@gmail.com>
+uint256_t <maekawatoshiki1017@gmail.com>
+unbounded <haakon@likedan.net>
+valiray <133289098+valiray@users.noreply.github.com>
+vodkaslime <646329483@qq.com>
+vvhg1 <94630311+vvhg1@users.noreply.github.com>
+vxiiduu <73044267+vxiiduu@users.noreply.github.com>
+wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
+whoreson <139810751+whoreson@users.noreply.github.com>
+wonjun Jang <strutive07@gmail.com>
+wzy <32936898+Freed-Wu@users.noreply.github.com>
+xaedes <xaedes@gmail.com>
+xaedes <xaedes@googlemail.com>
+xloem <0xloem@gmail.com>
+yangli2 <yangli2@gmail.com>
+yuiseki <yuiseki@gmail.com>
+zakkor <edward.partenie@gmail.com>
+zhouwg <6889919+zhouwg@users.noreply.github.com>
+zrm <trustiosity.zrm@gmail.com>
+源文雨 <41315874+fumiama@users.noreply.github.com>
+Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,8 @@ else()
    set(LLAMA_METAL_DEFAULT OFF)
 endif()

+set(LLAMA_LLAMAFILE_DEFAULT ON)
+
 # general
 option(BUILD_SHARED_LIBS                "build shared libraries"                                OFF)
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
@@ -88,6 +90,7 @@ endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
 option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
+option(LLAMA_LLAMAFILE                       "llama: use llamafile SGEMM"                       ${LLAMA_LLAMAFILE_DEFAULT})
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUDA                            "llama: use CUDA"                                  OFF)
 option(LLAMA_CUBLAS                          "llama: use CUDA (deprecated, use LLAMA_CUDA)"     OFF)
@@ -286,6 +289,7 @@ if (LLAMA_METAL)
        ${METALKIT_FRAMEWORK}
        )
 endif()
+
 if (LLAMA_BLAS)
    if (LLAMA_STATIC)
        set(BLA_STATIC ON)
@@ -368,6 +372,13 @@ if (LLAMA_BLAS)
    endif()
 endif()

+if (LLAMA_LLAMAFILE)
+    add_compile_definitions(GGML_USE_LLAMAFILE)
+
+    set(GGML_HEADERS_LLAMAFILE sgemm.h)
+    set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
+endif()
+
 if (LLAMA_QKK_64)
    add_compile_definitions(GGML_QKK_64)
 endif()
@@ -1151,15 +1162,16 @@ add_library(ggml OBJECT
            ggml-backend.h
            ggml-quants.c
            ggml-quants.h
-            ${GGML_SOURCES_CUDA}    ${GGML_HEADERS_CUDA}
-            ${GGML_SOURCES_OPENCL}  ${GGML_HEADERS_OPENCL}
-            ${GGML_SOURCES_METAL}   ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI}     ${GGML_HEADERS_MPI}
-            ${GGML_SOURCES_EXTRA}   ${GGML_HEADERS_EXTRA}
-            ${GGML_SOURCES_SYCL}    ${GGML_HEADERS_SYCL}
-            ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
-            ${GGML_SOURCES_VULKAN}  ${GGML_HEADERS_VULKAN}
-            ${GGML_SOURCES_ROCM}    ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_OPENCL}    ${GGML_HEADERS_OPENCL}
+            ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_MPI}       ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_EXTRA}     ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_SYCL}      ${GGML_HEADERS_SYCL}
+            ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
+            ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
+            ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
            )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023 Georgi Gerganov
+Copyright (c) 2023-2024 The ggml authors

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/Makefile
+++ b/Makefile
@@ -1,16 +1,28 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search  \
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search  \
 	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
-	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
-	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease                                 \
-	tests/test-json-schema-to-grammar
+	tests/test-autorelease \
+	tests/test-backend-ops \
+	tests/test-double-float \
+	tests/test-grad0 \
+	tests/test-grammar-integration \
+	tests/test-grammar-parser \
+	tests/test-json-schema-to-grammar \
+	tests/test-llama-grammar \
+	tests/test-model-load-cancel \
+	tests/test-opt \
+	tests/test-quantize-fns \
+	tests/test-quantize-perf \
+	tests/test-rope \
+	tests/test-sampling \
+	tests/test-tokenizer-0 \
+	tests/test-tokenizer-1-bpe \
+	tests/test-tokenizer-1-spm

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -27,6 +39,17 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif

+# In GNU make, the default CXX is g++ instead of c++.  Let's fix that so that users
+# of non-gcc compilers don't have to provide a g++ alias or wrapper.
+DEFCC  := cc
+DEFCXX := c++
+ifeq ($(origin CC),default)
+CC  := $(DEFCC)
+endif
+ifeq ($(origin CXX),default)
+CXX := $(DEFCXX)
+endif
+
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@@ -49,11 +72,17 @@ default: $(BUILD_TARGETS)
 test: $(TEST_TARGETS)
 	@failures=0; \
 	for test_target in $(TEST_TARGETS); do \
-		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
-			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
+		if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
 			continue; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
 			continue; \
@@ -384,6 +413,11 @@ ifdef LLAMA_OPENBLAS
 	MK_LDFLAGS  += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS

+ifndef LLAMA_NO_LLAMAFILE
+	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
+	OBJS        += sgemm.o
+endif
+
 ifdef LLAMA_BLIS
 	MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
 	MK_LDFLAGS  += -lblis -L/usr/local/lib
@@ -480,11 +514,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC_COMPILE)
-
 endif # LLAMA_CUDA

 ifdef LLAMA_CLBLAST
-
 	MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
 	MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -603,6 +635,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI

+ifndef LLAMA_NO_LLAMAFILE
+sgemm.o: sgemm.cpp sgemm.h ggml.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
 GF_CC := $(CC)
 include scripts/get-flags.mk

@@ -646,7 +683,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
-$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@@ -687,8 +724,8 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o

 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -756,11 +793,11 @@ batched: examples/batched/batched.cpp                         ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-batched-bench: examples/batched-bench/batched-bench.cpp       build-info.o ggml.o llama.o common.o $(OBJS)
+batched-bench: examples/batched-bench/batched-bench.cpp       build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-quantize: examples/quantize/quantize.cpp                      build-info.o ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp                      ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -788,10 +825,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

+# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+examples/server/%.hpp: examples/server/public/% Makefile
+	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+		echo "unsigned char $${NAME}[] = {" && \
+		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+		echo "};" && \
+		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+	) > $@
+
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -800,6 +846,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -918,6 +968,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -946,11 +1000,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -958,7 +1008,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

--- a/Package.swift
+++ b/Package.swift
@@ -2,6 +2,45 @@

 import PackageDescription

+var sources = [
+    "ggml.c",
+    "sgemm.cpp",
+    "llama.cpp",
+    "unicode.cpp",
+    "unicode-data.cpp",
+    "ggml-alloc.c",
+    "ggml-backend.c",
+    "ggml-quants.c",
+]
+
+var resources: [Resource] = []
+var linkerSettings: [LinkerSetting] = []
+var cSettings: [CSetting] =  [
+    .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
+    .unsafeFlags(["-fno-objc-arc"]),
+    // NOTE: NEW_LAPACK requires iOS 16.4+.
+    // Consider enabling the defines below once support for iOS 14 is dropped
+    // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
+    // .define("ACCELERATE_NEW_LAPACK"),
+    // .define("ACCELERATE_LAPACK_ILP64")
+]
+
+#if canImport(Darwin)
+sources.append("ggml-metal.m")
+resources.append(.process("ggml-metal.metal"))
+linkerSettings.append(.linkedFramework("Accelerate"))
+cSettings.append(
+    contentsOf: [
+        .define("GGML_USE_ACCELERATE"),
+        .define("GGML_USE_METAL")
+    ]
+)
+#endif
+
+#if os(Linux)
+    cSettings.append(.define("_GNU_SOURCE"))
+#endif
+
 let package = Package(
    name: "llama",
    platforms: [
@@ -28,34 +67,11 @@ let package = Package(
               "ggml-cuda.h",
               "Makefile"
            ],
-            sources: [
-                "ggml.c",
-                "llama.cpp",
-                "unicode.cpp",
-                "unicode-data.cpp",
-                "ggml-alloc.c",
-                "ggml-backend.c",
-                "ggml-quants.c",
-                "ggml-metal.m",
-            ],
-            resources: [
-                .process("ggml-metal.metal")
-            ],
+            sources: sources,
+            resources: resources,
            publicHeadersPath: "spm-headers",
-            cSettings: [
-                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE"),
-                .unsafeFlags(["-fno-objc-arc"]),
-                .define("GGML_USE_METAL"),
-                // NOTE: NEW_LAPACK will required iOS version 16.4+
-                // We should consider add this in the future when we drop support for iOS 14
-                // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
-                // .define("ACCELERATE_NEW_LAPACK"),
-                // .define("ACCELERATE_LAPACK_ILP64")
-            ],
-            linkerSettings: [
-                .linkedFramework("Accelerate")
-            ]
+            cSettings: cSettings,
+            linkerSettings: linkerSettings
        )
    ],
    cxxLanguageStandard: .cxx11
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -3,14 +3,14 @@
 - [Background](#background)
 - [News](#news)
 - [OS](#os)
- [Supported Devices](#supported-devices)
+- [Hardware](#hardware)
 - [Docker](#docker)
 - [Linux](#linux)
 - [Windows](#windows)
 - [Environment Variable](#environment-variable)
- [Known Issue](#known-issue)
- [Q&A](#q&a)
- [Todo](#todo)
+- [Known Issues](#known-issues)
+- [Q&A](#qa)
+- [TODO](#todo)

 ## Background

@@ -24,19 +24,20 @@
 - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.

 ### Llama.cpp + SYCL
-This SYCL "backend" follows the same design found in other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. The oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.

-The llama.cpp SYCL backend supports:
- Intel GPUs.
- Nvidia GPUs.
+The llama.cpp SYCL backend is designed primarily to support **Intel GPUs**. Thanks to the cross-platform nature of SYCL, it can also support GPUs from other vendors: Nvidia GPUs (*AMD GPU support is coming*).

-*Upcoming support: AMD GPUs*.
+When targeting **Intel CPUs**, it is recommended to use llama.cpp with the [Intel oneMKL](README.md#intel-onemkl) backend.

-When targetting **Intel CPUs**, it is recommended to  use llama.cpp for [x86_64](README.md#intel-onemkl) approach.
+Its design is similar to that of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast, etc.* The initial implementation was produced with oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)).

 ## News

+- 2024.4
+  - Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.
+
 - 2024.3
+  - Release binary files for Windows.
  - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
  - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
  - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
@@ -53,57 +54,48 @@ When targetting **Intel CPUs**, it is recommended to  use llama.cpp for [x86_64]

 ## OS

-|OS|Status|Verified|
-|-|-|-|
-|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
-|Windows|Support|Windows 11|
+| OS      | Status  | Verified                           |
+|---------|---------|------------------------------------|
+| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39 |
+| Windows | Support | Windows 11                         |


-## Supported devices
+## Hardware

-### Intel GPUs
+### Intel GPU

-The oneAPI Math Kernel Library, which the oneAPI base-toolkit includes, supports intel GPUs. In order to make it "visible", simply run the following:
-```sh
-source /opt/intel/oneapi/setvars.sh
-```
+**Verified devices**

- **Tested devices**
-
-|Intel GPU| Status | Verified Model|
-|-|-|-|
-|Intel Data Center Max Series| Support| Max 1550|
-|Intel Data Center Flex Series| Support| Flex 170|
-|Intel Arc Series| Support| Arc 770, 730M|
-|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
-|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
+| Intel GPU                     | Status  | Verified Model                        |
+|-------------------------------|---------|---------------------------------------|
+| Intel Data Center Max Series  | Support | Max 1550, 1100                        |
+| Intel Data Center Flex Series | Support | Flex 170                              |
+| Intel Arc Series              | Support | Arc 770, 730M                         |
+| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
+| Intel iGPU                    | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |

 *Notes:*

- Device memory can be a limitation when running a large model on an intel GPU. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
+- **Memory**
+  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.

- Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPUs and 4.0GB for discrete GPUs.
+  - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For example, *llama-2-7b.Q4_0* requires at least 8.0GB for an integrated GPU and 4.0GB for a discrete GPU.

- If the iGPU has less than 80  EUs *(Execution Unit)*, the inference speed will likely be too slow for practical use.
+- **Execution Unit (EU)**
+  - If the iGPU has fewer than 80 EUs, the inference speed will likely be too slow for practical use.

-### Nvidia GPUs
-The BLAS acceleration on Nvidia GPUs through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)
+### Other Vendor GPU

- **Tested devices**
-
-|Nvidia GPU| Status | Verified Model|
-|-|-|-|
-|Ampere Series| Support| A100, A4000|
-|Ampere Series *(Mobile)*| Support| RTX 40 Series|
-
-*Notes:*
-  - Support for Nvidia targets through oneAPI is currently limited to Linux platforms.
-
-  - Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs.
+**Verified devices**

+| Nvidia GPU               | Status  | Verified Model |
+|--------------------------|---------|----------------|
+| Ampere Series            | Support | A100, A4000    |
+| Ampere Series *(Mobile)* | Support | RTX 40 Series  |

 ## Docker
 The docker build option is currently limited to *intel GPU* targets.
+
 ### Build image
 ```sh
 # Using FP16
@@ -169,30 +161,11 @@ Platform #0: Intel(R) OpenCL HD Graphics

 - **Nvidia GPU**

-In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
-Installation can be verified by running the following:
-```sh
-nvidia-smi
-```
-Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*:
-```
-+---------------------------------------------------------------------------------------+
-| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
-|-----------------------------------------+----------------------+----------------------+
-| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
-| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
-|                                         |                      |               MIG M. |
-|=========================================+======================+======================|
-|   0  NVIDIA A100-PCIE-40GB          On  | 00000000:8D:00.0 Off |                    0 |
-| N/A   36C    P0              57W / 250W |      4MiB / 40960MiB |      0%      Default |
-|                                         |                      |             Disabled |
-+-----------------------------------------+----------------------+----------------------+
-```
-
+In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.

 2. **Install Intel® oneAPI Base toolkit**

- **Base installation**
+- **For Intel GPU**

 The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.

@@ -204,17 +177,16 @@ Upon a successful installation, SYCL is enabled for the available intel devices,

 - **Adding support to Nvidia GPUs**

-**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
+**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.


-**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
+**oneMKL for cuBLAS**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.

 ```sh
 git clone https://github.com/oneapi-src/oneMKL
 cd oneMKL
-mkdir -p buildWithCublas && cd buildWithCublas
-cmake ../ -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
-make
+cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
+cmake --build buildWithCublas --config Release
 ```


@@ -239,7 +211,7 @@ When targeting an intel GPU, the user should expect one or more level-zero devic

 - **Nvidia GPU**

-Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
+Similarly, users targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as below:
 ```
 [opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.12.0.12_195853.xmain-hotfix]
 [opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
@@ -254,13 +226,15 @@ Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA devi
 source /opt/intel/oneapi/setvars.sh

 # Build LLAMA with MKL BLAS acceleration for intel GPU
-mkdir -p build && cd build

-# Option 1: Use FP16 for better performance in long-prompt  inference
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

-# Option 2: Use FP32 by default
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+# Option 2: Use FP16
+cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
+# Build all binaries
+cmake --build build --config Release -j -v
 ```

 #### Nvidia GPU
@@ -272,13 +246,16 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_
 export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR

 # Build LLAMA with Nvidia BLAS acceleration through SYCL
-mkdir -p build && cd build

-# Option 1: Use FP16 for better performance in long-prompt  inference
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+# Option 2: Use FP16
+cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
+# Build all binaries
+cmake --build build --config Release -j -v

-# Option 2: Use FP32 by default
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 ```

 ### III. Run the inference
@@ -314,10 +291,10 @@ found 6 SYCL devices:
 | 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|
 ```

-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero driver/runtime, recommended |
-|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|
+| Attribute              | Note                                                        |
+|------------------------|-------------------------------------------------------------|
+| compute capability 1.3 | Level-zero driver/runtime, recommended                      |
+| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |

 4. Launch inference

@@ -326,10 +303,10 @@ There are two device selection modes:
 - Single device: Use one device target specified by the user.
 - Multiple devices: Automatically select the devices with the same largest Max compute-units.

-|Device selection|Parameter|
-|-|-|
-|Single device|--split-mode none --main-gpu DEVICE_ID |
-|Multiple devices|--split-mode layer (default)|
+| Device selection | Parameter                              |
+|------------------|----------------------------------------|
+| Single device    | --split-mode none --main-gpu DEVICE_ID |
+| Multiple devices | --split-mode layer (default)           |

 Examples:

@@ -358,7 +335,6 @@ Otherwise, you can run the script:

 *Notes:*

- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue.
 - Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:

 ```sh
@@ -433,13 +409,15 @@ b. Download & install mingw-w64 make for Windows provided by w64devkit
 On the oneAPI command line window, step into the llama.cpp main directory and run the following:

 ```
-mkdir -p build
-cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

-cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release

-make
+# Option 2: Use FP16
+cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
+
+cmake --build build --config Release -j
 ```

 Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
@@ -487,10 +465,10 @@ found 6 SYCL devices:

 ```

-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
+| Attribute              | Note                                                      |
+|------------------------|-----------------------------------------------------------|
+| compute capability 1.3 | Level-zero running time, recommended                      |
+| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |


 4. Launch inference
@@ -500,10 +478,10 @@ There are two device selection modes:
 - Single device: Use one device assigned by user.
 - Multiple devices: Automatically choose the devices with the same biggest Max compute units.

-|Device selection|Parameter|
-|-|-|
-|Single device|--split-mode none --main-gpu DEVICE_ID |
-|Multiple devices|--split-mode layer (default)|
+| Device selection | Parameter                              |
+|------------------|----------------------------------------|
+| Single device    | --split-mode none --main-gpu DEVICE_ID |
+| Multiple devices | --split-mode layer (default)           |

 Examples:

@@ -526,7 +504,6 @@ Otherwise, run the following wrapper script:

 Note:

- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue.
 - Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:

 ```sh
@@ -541,29 +518,23 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 #### Build

-|Name|Value|Function|
-|-|-|-|
-|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
-|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
-|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
-|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
-|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|
+| Name               | Value                             | Function                                    |
+|--------------------|-----------------------------------|---------------------------------------------|
+| LLAMA_SYCL         | ON (mandatory)                    | Enable build with SYCL code path.           |
+| LLAMA_SYCL_TARGET  | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
+| LLAMA_SYCL_F16     | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
+| CMAKE_C_COMPILER   | icx                               | Set *icx* compiler for SYCL code path.      |
+| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |

 #### Runtime

-|Name|Value|Function|
-|-|-|-|
-|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
-|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|
+| Name              | Value            | Function                                                                                                                  |
+|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
+| GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
+| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |

 ## Known Issues

- Hanging during startup
-
-  llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang.
-
-  - **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable.
-
 - `Split-mode:[row]` is not supported.

 ## Q&A
@@ -575,7 +546,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 - General compiler error:

-  - Remove build folder or try a clean-build.
+  - Remove **build** folder or try a clean-build.

 - I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.

@@ -592,6 +563,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 ### **GitHub contribution**:
 Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.

-## Todo
+## TODO

 - Support row layer split for multiple card runs.
--- a/README.md
+++ b/README.md
@@ -10,6 +10,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Recent API changes

+- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
+- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
 - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
 - [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
 - [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
@@ -18,10 +20,11 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

- **MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387**
+- **BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920**
+- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
 - Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
- Multi-GPU pipeline parallelizm support https://github.com/ggerganov/llama.cpp/pull/6017
+- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017
 - Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
 - Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
@@ -91,9 +94,11 @@ Typically finetunes of the base models below are supported as well.

 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
+- [x] LLaMA 3 🦙🦙🦙
 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [X] Falcon
+- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
+- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
@@ -116,9 +121,14 @@ Typically finetunes of the base models below are supported as well.
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
 - [x] [Mamba](https://github.com/state-spaces/mamba)
+- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
 - [x] [Xverse](https://huggingface.co/models?search=xverse)
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
+- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
+- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
+- [x] [OLMo](https://allenai.org/olmo)
+
+(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))

 **Multimodal models:**

@@ -128,6 +138,8 @@ Typically finetunes of the base models below are supported as well.
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
+- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
+- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)

 **HTTP server**

@@ -180,6 +192,11 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [Msty](https://msty.app) (proprietary)
 - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
 - [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
+- [Dot](https://github.com/alexpinel/Dot) (GPL)
+- [MindMac](https://mindmac.app) (proprietary)
+- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
+- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
+- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)

 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

@@ -291,6 +308,8 @@ In order to build llama.cpp you have three different options.
      make
      ```

+      **Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`
+
  - On Windows:

    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
@@ -305,12 +324,26 @@ In order to build llama.cpp you have three different options.
 - Using `CMake`:

    ```bash
-    mkdir build
-    cd build
-    cmake ..
-    cmake --build . --config Release
+    cmake -B build
+    cmake --build build --config Release
    ```

+    **Note**: for `Debug` builds, there are two cases:
+
+    - Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
+
+      ```bash
+      cmake -B build -DCMAKE_BUILD_TYPE=Debug
+      cmake --build build
+      ```
+
+    - Multi-config generators (`-G` param set to Visual Studio, XCode...):
+
+      ```bash
+      cmake -B build -G "Xcode"
+      cmake --build build --config Debug
+      ```
+
 - Using `Zig` (version 0.11 or later):

    Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
@@ -422,10 +455,8 @@ Building the program with BLAS support may lead to some performance improvements
  - Using `CMake` on Linux:

      ```bash
-      mkdir build
-      cd build
-      cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
-      cmake --build . --config Release
+      cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+      cmake --build build --config Release
      ```

 - #### BLIS
@@ -445,11 +476,9 @@ Building the program with BLAS support may lead to some performance improvements
  - Using manual oneAPI installation:
    By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
      ```bash
-      mkdir build
-      cd build
      source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-basekit docker image, only required for manual installation
-      cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
-      cmake --build . --config Release
+      cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
+      cmake --build build --config Release
      ```

  - Using oneAPI docker image:
@@ -470,28 +499,26 @@ Building the program with BLAS support may lead to some performance improvements
  - Using `CMake`:

    ```bash
-    mkdir build
-    cd build
-    cmake .. -DLLAMA_CUDA=ON
-    cmake --build . --config Release
+    cmake -B build -DLLAMA_CUDA=ON
+    cmake --build build --config Release
    ```

  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:

-  | Option                         | Legal values           | Default | Description |
-  |--------------------------------|------------------------|---------|-------------|
-  | LLAMA_CUDA_FORCE_DMMV          | Boolean                |   false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-  | LLAMA_CUDA_DMMV_X              | Positive integer >= 32 |      32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_MMV_Y               | Positive integer       |       1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
-  | LLAMA_CUDA_F16                 | Boolean                |   false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
-  | LLAMA_CUDA_KQUANTS_ITER        | 1 or 2                 |       2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
-  | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       |     128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
+  | Option                         | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
+  |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+  | LLAMA_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
+  | LLAMA_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
+  | LLAMA_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
+  | LLAMA_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
+  | LLAMA_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
+  | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |

 - #### hipBLAS

  This provides BLAS acceleration on HIP-supported AMD GPUs.
  Make sure to have ROCm installed.
-  You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
+  You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick).

  - Using `make`:
    ```bash
@@ -500,15 +527,15 @@ Building the program with BLAS support may lead to some performance improvements
  - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
    ```bash
    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
-        cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
-        && cmake --build build -- -j 16
+        cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
+        && cmake --build build --config Release -- -j 16
    ```
    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
    However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).

  - Using `make` (example for target gfx1030, build with 16 CPU threads):
    ```bash
-    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gxf1030
+    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
    ```

  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
@@ -516,7 +543,7 @@ Building the program with BLAS support may lead to some performance improvements
    set PATH=%HIP_PATH%\bin;%PATH%
    mkdir build
    cd build
-    cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
+    cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release ..
    cmake --build .
    ```
    Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
@@ -527,18 +554,18 @@ Building the program with BLAS support may lead to some performance improvements
  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
  The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):

-  | Option                  | Legal values           | Default | Description |
-  |-------------------------|------------------------|---------|-------------|
-  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 |      32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_MMV_Y        | Positive integer       |       1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 |       2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+  | Option                  | Legal values           | Default | Description                                                                                                                                                                                                                                    |
+  |-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+  | LLAMA_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
+  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |

 - #### CLBlast

  OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.

  You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
-    - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+    - For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.

    - For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.

@@ -547,15 +574,14 @@ Building the program with BLAS support may lead to some performance improvements

        ```sh
        git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
-        mkdir OpenCL-SDK/build
-        cd OpenCL-SDK/build
-        cmake .. -DBUILD_DOCS=OFF \
+        cd OpenCL-SDK
+        cmake -B build -DBUILD_DOCS=OFF \
          -DBUILD_EXAMPLES=OFF \
          -DBUILD_TESTING=OFF \
          -DOPENCL_SDK_BUILD_SAMPLES=OFF \
          -DOPENCL_SDK_TEST_SAMPLES=OFF
-        cmake --build . --config Release
-        cmake --install . --prefix /some/path
+        cmake --build build
+        cmake --install build --prefix /some/path
        ```
      </details>

@@ -563,6 +589,12 @@ Building the program with BLAS support may lead to some performance improvements

  Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.

+  Linux packaging:
+  Fedora Linux:
+  ```bash
+  sudo dnf install clblast
+  ```
+
  Alternatively, they may be built from source.

  - <details>
@@ -571,23 +603,23 @@ Building the program with BLAS support may lead to some performance improvements
      ```cmd
      set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
      git clone https://github.com/CNugteren/CLBlast.git
-      mkdir CLBlast\build
-      cd CLBlast\build
-      cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
-      cmake --build . --config Release
-      cmake --install . --prefix C:/CLBlast
+      cd CLBlast
+      cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
+      cmake --build build --config Release
+      cmake --install build --prefix C:/CLBlast
      ```

+      (note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)
+
  - <details>
    <summary>Unix:</summary>

      ```sh
      git clone https://github.com/CNugteren/CLBlast.git
-      mkdir CLBlast/build
-      cd CLBlast/build
-      cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
-      cmake --build . --config Release
-      cmake --install . --prefix /some/path
+      cd CLBlast
+      cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
+      cmake --build build --config Release
+      cmake --install build --prefix /some/path
      ```

      Where `/some/path` is where the built library will be installed (default is `/usr/local`).
@@ -601,21 +633,17 @@ Building the program with BLAS support may lead to some performance improvements
    ```
  - CMake (Unix):
    ```sh
-    mkdir build
-    cd build
-    cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
-    cmake --build . --config Release
+    cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+    cmake --build build --config Release
    ```
  - CMake (Windows):
    ```cmd
    set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
    git clone https://github.com/ggerganov/llama.cpp
    cd llama.cpp
-    mkdir build
-    cd build
-    cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
-    cmake --build . --config Release
-    cmake --install . --prefix C:/LlamaCPP
+    cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
+    cmake --build build --config Release
+    cmake --install build --prefix C:/LlamaCPP
    ```

  ##### Running Llama with CLBlast
@@ -671,10 +699,8 @@ Building the program with BLAS support may lead to some performance improvements
  Then, build llama.cpp using the cmake command below:

  ```bash
-  mkdir -p build
-  cd build
-  cmake .. -DLLAMA_VULKAN=1
-  cmake --build . --config Release
+  cmake -B build -DLLAMA_VULKAN=1
+  cmake --build build --config Release
  # Test the output binary (with "-ngl 33" to offload all layers to GPU)
  ./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4

@@ -739,11 +765,11 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver
 As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.

 | Model | Original size | Quantized size (Q4_0) |
-|------:|--------------:|-----------------------:|
-|    7B |         13 GB |                 3.9 GB |
-|   13B |         24 GB |                 7.8 GB |
-|   30B |         60 GB |                19.5 GB |
-|   65B |        120 GB |                38.5 GB |
+|------:|--------------:|----------------------:|
+|    7B |         13 GB |                3.9 GB |
+|   13B |         24 GB |                7.8 GB |
+|   30B |         60 GB |               19.5 GB |
+|   65B |        120 GB |               38.5 GB |

 ### Quantization

@@ -751,7 +777,7 @@ Several quantization methods are supported. They differ in the resulting model d

 *(outdated)*

-| Model | Measure      | F16    | Q4_0   | Q4_1   | Q5_0   | Q5_1   | Q8_0   |
+| Model | Measure      |    F16 |   Q4_0 |   Q4_1 |   Q5_0 |   Q5_1 |   Q8_0 |
 |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
 |    7B | perplexity   | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
 |    7B | file size    |  13.0G |   3.5G |   3.9G |   4.3G |   4.7G |   6.7G |
@@ -1099,7 +1125,9 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
 - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
+
+![matmul](media/matmul.png)

 ### Docs

--- a/SECURITY.md
+++ b/SECURITY.md
@@ -49,11 +49,11 @@ If you intend to run multiple models in parallel with shared memory, it is your

 1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.

-1. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
+2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.

-1. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
+3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.

-1. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
+4. Hardware Attacks: GPUs or TPUs can also be attacked. [Research](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.

 ## Reporting a vulnerability

--- a/build.zig
+++ b/build.zig
@@ -112,6 +112,7 @@ pub fn build(b: *std.build.Builder) !void {
    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;

    const ggml = make.obj("ggml", "ggml.c");
+    const sgemm = make.obj("sgemm", "sgemm.cpp");
    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
@@ -128,15 +129,44 @@ pub fn build(b: *std.build.Builder) !void {
    const clip = make.obj("clip", "examples/llava/clip.cpp");
    const llava = make.obj("llava", "examples/llava/llava.cpp");

-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });

-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
+
+    const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
+    for (server_assets) |asset| {
+        const input_path = b.fmt("examples/server/public/{s}", .{asset});
+        const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
+
+        // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path })`:
+
+        const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
+        defer b.allocator.free(input);
+
+        var buf = std.ArrayList(u8).init(b.allocator);
+        defer buf.deinit();
+
+        for (input) |byte| {
+            try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
+        }
+
+        var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
+        defer b.allocator.free(name);
+        std.mem.replaceScalar(u8, name, '.', '_');
+
+        try std.fs.cwd().writeFile(output_path, b.fmt(
+            "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
+            .{ name, buf.items, name, input.len },
+        ));
+
+        std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
+    }
 }
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -153,6 +153,55 @@ function gg_sum_ctest_release {
    gg_printf '```\n'
 }

+# test_scripts_debug
+
+function gg_run_test_scripts_debug { # debug-build test-script step; currently runs no tests because both invocations below are commented out
+    cd ${SRC} # work from the source tree root
+
+    set -e # make any failing command fatal for the duration of the step
+
+    # TODO: too slow, run on dedicated node
+   #(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+   #(cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+
+    set +e # switch errexit back off before returning
+}
+
+function gg_sum_test_scripts_debug { # prints the summary section for the debug test-script step via gg_printf
+    gg_printf '### %s\n\n' "${ci}" # heading comes from ${ci} (presumably the current step name - confirm)
+
+    gg_printf 'Runs test scripts in debug mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" # status text is read from $OUT/${ci}.exit
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)" # NOTE(review): the debug run step has its script invocations commented out, so this log may not exist - confirm
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
+# test_scripts_release
+
+function gg_run_test_scripts_release { # runs the gguf-split and quantize tests.sh scripts against the release build binaries
+    cd ${SRC} # work from the source tree root
+
+    set -e # make any failing command fatal for the duration of the step
+
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log # timed run in a subshell; stdout+stderr are appended to the scripts log
+    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log # same pattern, appending to the same log file
+
+    set +e # switch errexit back off before returning
+}
+
+function gg_sum_test_scripts_release { # prints the summary section for the release test-script step via gg_printf
+    gg_printf '### %s\n\n' "${ci}" # heading comes from ${ci} (presumably the current step name - confirm)
+
+    gg_printf 'Runs test scripts in release mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" # status text is read from $OUT/${ci}.exit
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)" # full contents of the log that gg_run_test_scripts_release appends to
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
 function gg_get_model {
    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
@@ -287,7 +336,8 @@ function gg_run_open_llama_3b_v2 {

    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state     --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -468,7 +518,10 @@ function gg_run_open_llama_7b_v2 {

    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

-    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
@@ -642,6 +695,9 @@ test $ret -eq 0 && gg_run ctest_release
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small

+    test $ret -eq 0 && gg_run test_scripts_debug
+    test $ret -eq 0 && gg_run test_scripts_release
+
    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
        if [ -z ${GG_BUILD_CUDA} ]; then
            test $ret -eq 0 && gg_run open_llama_3b_v2
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -47,9 +47,6 @@ if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()

-set(TARGET json-schema-to-grammar)
-add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h)
-
 set(TARGET common)

 add_library(${TARGET} STATIC
@@ -63,6 +60,7 @@ add_library(${TARGET} STATIC
    grammar-parser.h
    grammar-parser.cpp
    json.hpp
+    json-schema-to-grammar.cpp
    train.h
    train.cpp
    ngram-cache.h
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,4 +1,6 @@
 #include "common.h"
+#include "json.hpp"
+#include "json-schema-to-grammar.h"
 #include "llama.h"

 #include <algorithm>
@@ -16,6 +18,7 @@
 #include <unordered_set>
 #include <vector>
 #include <cinttypes>
+#include <codecvt>

 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -27,7 +30,6 @@
 #ifndef NOMINMAX
 #   define NOMINMAX
 #endif
-#include <codecvt>
 #include <locale>
 #include <windows.h>
 #include <fcntl.h>
@@ -65,9 +67,10 @@
 #include <sys/syslimits.h>
 #endif
 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
-#define LLAMA_CURL_MAX_HEADER_LENGTH 256
 #endif // LLAMA_USE_CURL

+using json = nlohmann::ordered_json;
+
 int32_t get_num_physical_cores() {
 #ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
@@ -104,6 +107,79 @@ int32_t get_num_physical_cores() {
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }

+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#include <pthread.h>
+
+static void cpuid(unsigned leaf, unsigned subleaf, // executes CPUID for (leaf, subleaf); the four result registers are written through the pointers below
+                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
+    __asm__("movq\t%%rbx,%%rsi\n\t" // stash the caller's rbx in rsi, since cpuid overwrites ebx
+            "cpuid\n\t"
+            "xchgq\t%%rbx,%%rsi" // swap back: rbx is restored and cpuid's ebx result ends up in rsi
+            : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx) // outputs: eax, rsi (carrying ebx), ecx, edx
+            : "0"(leaf), "2"(subleaf)); // inputs are tied to operand 0 (eax) and operand 2 (ecx)
+}
+
+static int pin_cpu(int cpu) { // sets the calling thread's affinity to the single CPU index `cpu`
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    CPU_SET(cpu, &mask); // the mask contains only `cpu`
+    return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); // result is passed through; count_math_cpus() treats non-zero as failure
+}
+
+static bool is_hybrid_cpu(void) { // true when CPUID leaf 7, subleaf 0 reports EDX bit 15 set
+    unsigned eax, ebx, ecx, edx;
+    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+    return !!(edx & (1u << 15)); // NOTE(review): bit 15 is presumably the "hybrid" feature flag - confirm against the vendor CPUID reference
+}
+
+static bool is_running_on_efficiency_core(void) { // true when CPUID leaf 0x1a reports the Intel Atom core type
+    unsigned eax, ebx, ecx, edx;
+    cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx); // presumably describes the core this thread is executing on, hence the pinning done by the caller - TODO confirm
+    int intel_atom = 0x20; // core-type value that is compared against below
+    int core_type = (eax & 0xff000000u) >> 24; // core type is taken from the top byte of EAX
+    return core_type == intel_atom;
+}
+
+static int count_math_cpus(int cpu_count) { // counts non-efficiency cores among CPU indices [0, cpu_count); returns -1 as soon as pinning to a CPU fails
+    int result = 0;
+    for (int cpu = 0; cpu < cpu_count; ++cpu) {
+        if (pin_cpu(cpu)) { // move this thread onto `cpu` so the core-type query below runs there
+            return -1;
+        }
+        if (is_running_on_efficiency_core()) {
+            continue; // efficiency cores harm lockstep threading, so they are not counted
+        }
+        ++cpu; // hyperthreading isn't useful for linear algebra: skip the next index (presumably this core's sibling thread - confirm)
+        ++result;
+    }
+    return result; // this function does not restore the thread's affinity; get_math_cpu_count() does that after calling it
+}
+
+#endif // __x86_64__ && __linux__
+
+/**
+ * Returns the number of CPUs useful for math: on hybrid x86-64 Linux CPUs the count from count_math_cpus(), otherwise (or on any failure) get_num_physical_cores().
+ */
+int get_math_cpu_count() {
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+    int cpu_count = sysconf(_SC_NPROCESSORS_ONLN); // upper bound on the CPU indices probed below
+    if (cpu_count < 1) {
+        return get_num_physical_cores();
+    }
+    if (is_hybrid_cpu()) {
+        cpu_set_t affinity; // the thread's current affinity, saved so it can be put back after probing
+        if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
+            int result = count_math_cpus(cpu_count); // re-pins this thread to each CPU in turn
+            pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity); // restore the saved affinity; the return value is not checked
+            if (result > 0) { // 0 or -1 falls through to the generic fallback below
+                return result;
+            }
+        }
+    }
+#endif
+    return get_num_physical_cores();
+}
+
 void process_escapes(std::string & input) {
    std::size_t input_len = input.length();
    std::size_t output_idx = 0;
@@ -157,15 +233,63 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    return result;
 }

+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) { // parses "KEY=TYPE:VALUE" and appends it to `overrides`; on bad input logs to stderr and returns false
+    const char * sep = strchr(data, '=');
+    if (sep == nullptr || sep - data >= 128) { // key must be under 128 chars to fit kvo.key plus terminator (buffer size presumably 128 — confirm in llama.h)
+        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+        return false;
+    }
+    llama_model_kv_override kvo;
+    std::strncpy(kvo.key, data, sep - data);
+    kvo.key[sep - data] = 0; // strncpy does not terminate when it copies exactly `sep - data` chars
+    sep++; // step over '=' so `sep` now points at "TYPE:VALUE"
+    if (strncmp(sep, "int:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+        kvo.val_i64 = std::atoll(sep); // atoll, not atol: `long` is only 32 bits on LLP64 targets such as Windows
+    } else if (strncmp(sep, "float:", 6) == 0) {
+        sep += 6;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.val_f64 = std::atof(sep);
+    } else if (strncmp(sep, "bool:", 5) == 0) {
+        sep += 5;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+        if (std::strcmp(sep, "true") == 0) { // only the exact spellings "true" / "false" are accepted
+            kvo.val_bool = true;
+        } else if (std::strcmp(sep, "false") == 0) {
+            kvo.val_bool = false;
+        } else {
+            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            return false;
+        }
+    } else if (strncmp(sep, "str:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+        if (strlen(sep) > 127) { // longer values are rejected rather than silently truncated
+            fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+            return false;
+        }
+        strncpy(kvo.val_str, sep, 127);
+        kvo.val_str[127] = '\0';
+    } else {
+        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+        return false;
+    }
+    overrides.emplace_back(std::move(kvo));
+    return true;
+}
+
 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
-    llama_sampling_params& sparams = params.sparams;
+    llama_sampling_params & sparams = params.sparams;

    if (arg == "-s" || arg == "--seed") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
+        // This is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
        params.seed = std::stoul(argv[i]);
+        sparams.seed = std::stoul(argv[i]);
        return true;
    }
    if (arg == "-t" || arg == "--threads") {
@@ -768,7 +892,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
            invalid_param = true;
            return true;
        }
-        params.image = argv[i];
+        params.image.emplace_back(argv[i]);
        return true;
    }
    if (arg == "-i" || arg == "--interactive") {
@@ -823,6 +947,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.cont_batching = true;
        return true;
    }
+    if (arg == "-fa" || arg == "--flash-attn") {
+        params.flash_attn = true;
+        return true;
+    }
    if (arg == "--color") {
        params.use_color = true;
        return true;
@@ -1010,6 +1138,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.n_print = std::stoi(argv[i]);
        return true;
    }
+    if (arg == "--check-tensors") {
+        params.check_tensors = true;
+        return true;
+    }
    if (arg == "--ppl-output-type") {
        if (++i >= argc) {
            invalid_param = true;
@@ -1148,52 +1280,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        );
        return true;
    }
+    if (arg == "-j" || arg == "--json-schema") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
+        return true;
+    }
    if (arg == "--override-kv") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
-        char* sep = strchr(argv[i], '=');
-        if (sep == nullptr || sep - argv[i] >= 128) {
-            fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        struct llama_model_kv_override kvo;
-        std::strncpy(kvo.key, argv[i], sep - argv[i]);
-        kvo.key[sep - argv[i]] = 0;
-        sep++;
-        if (strncmp(sep, "int:", 4) == 0) {
-            sep += 4;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.int_value = std::atol(sep);
-        }
-        else if (strncmp(sep, "float:", 6) == 0) {
-            sep += 6;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-            kvo.float_value = std::atof(sep);
-        }
-        else if (strncmp(sep, "bool:", 5) == 0) {
-            sep += 5;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-            if (std::strcmp(sep, "true") == 0) {
-                kvo.bool_value = true;
-            }
-            else if (std::strcmp(sep, "false") == 0) {
-                kvo.bool_value = false;
-            }
-            else {
-                fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                invalid_param = true;
-                return true;
-            }
-        }
-        else {
+        if (!parse_kv_override(argv[i], params.kv_overrides)) {
            fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
            invalid_param = true;
            return true;
        }
-        params.kv_overrides.push_back(kvo);
        return true;
    }
 #ifndef LOG_DISABLE_LOGS
@@ -1223,6 +1327,29 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
    return false;
 }

+void gpt_params_handle_model_default(gpt_params & params) { // fills in params.model (and params.hf_file) when left empty; throws std::invalid_argument if --hf-repo has neither
+    if (!params.hf_repo.empty()) {
+        // short-hand to avoid specifying --hf-file -> default it to --model
+        if (params.hf_file.empty()) {
+            if (params.model.empty()) {
+                throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
+            }
+            params.hf_file = params.model;
+        } else if (params.model.empty()) {
+            params.model = "models/" + string_split(params.hf_file, '/').back(); // presumably the last '/'-separated component of hf_file
+        }
+    } else if (!params.model_url.empty()) {
+        if (params.model.empty()) {
+            auto f = string_split(params.model_url, '#').front(); // presumably drops a trailing "#fragment"
+            f = string_split(f, '?').front(); // presumably drops a trailing "?query"
+            f = string_split(f, '/').back(); // what remains after the last '/' is used as the file name
+            params.model =  "models/" + f;
+        }
+    } else if (params.model.empty()) {
+        params.model = DEFAULT_MODEL_PATH; // no repo, URL or explicit path given
+    }
+}
+
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
    bool invalid_param = false;
    std::string arg;
@@ -1251,10 +1378,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

-    // short-hand to avoid specifying --hf-file -> default it to --model
-    if (!params.hf_repo.empty() && params.hf_file.empty()) {
-        params.hf_file = params.model;
-    }
+    gpt_params_handle_model_default(params);

    if (params.escape) {
        process_escapes(params.prompt);
@@ -1353,6 +1477,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
    printf("  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
    printf("  --grammar-file FNAME  file to read grammar from\n");
+    printf("  -j SCHEMA, --json-schema SCHEMA\n");
+    printf("                        JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
+    printf("                        For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
    printf("  --cfg-negative-prompt PROMPT\n");
    printf("                        negative prompt to use for guidance. (default: empty)\n");
    printf("  --cfg-negative-prompt-file FNAME\n");
@@ -1390,8 +1517,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
    printf("  -ps N, --p-split N    speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
    printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+    printf("  -fa, --flash-attn     enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
    printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
-    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models\n");
+    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models. Specify multiple times for batching\n");
    if (llama_supports_mlock()) {
        printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
@@ -1444,7 +1572,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --control-vector-layer-range START END\n");
    printf("                        layer range to apply the control vector(s) to, start and end inclusive\n");
    printf("  -m FNAME, --model FNAME\n");
-    printf("                        model path (default: %s)\n", params.model.c_str());
+    printf("                        model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
    printf("  -md FNAME, --model-draft FNAME\n");
    printf("                        draft model for speculative decoding (default: unused)\n");
    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
@@ -1461,9 +1589,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+    printf("                        types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf("  -ptc N, --print-token-count N\n");
    printf("                        print token count every N tokens (default: %d)\n", params.n_print);
+    printf("  --check-tensors       check model tensor data for invalid values\n");
    printf("\n");
 #ifndef LOG_DISABLE_LOGS
    log_print_usage();
@@ -1500,6 +1629,77 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
    GGML_UNREACHABLE();
 }

+// Validate if a filename is safe to use
+// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
+bool validate_file_name(const std::string & filename) { // returns true only if every check below passes
+    if (!filename.length()) {
+        // Empty filename invalid
+        return false;
+    }
+    if (filename.length() > 255) {
+        // Limit at common largest possible filename on Linux filesystems
+        // to avoid unnecessary further validation
+        // (On systems with smaller limits it will be caught by the OS)
+        return false;
+    }
+
+    std::u32string filename_utf32;
+    try {
+        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter; // NOTE: wstring_convert / codecvt_utf8 are deprecated since C++17
+        filename_utf32 = converter.from_bytes(filename);
+
+        // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
+        // or invalid encodings were encountered. Reject such attempts
+        std::string filename_reencoded = converter.to_bytes(filename_utf32);
+        if (filename_reencoded != filename) {
+            return false;
+        }
+    } catch (const std::exception &) { // a conversion failure is treated as an invalid name
+        return false;
+    }
+
+    // Check for forbidden codepoints:
+    // - Control characters
+    // - Unicode equivalents of illegal characters
+    // - UTF-16 surrogate pairs
+    // - Unicode replacement character (U+FFFD)
+    // - Byte order mark (BOM)
+    // - Illegal characters: / \ : * ? " < > |
+    for (char32_t c : filename_utf32) {
+        if (c <= 0x1F // Control characters (C0)
+            || c == 0x7F // Control characters (DEL)
+            || (c >= 0x80 && c <= 0x9F) // Control characters (C1)
+            || c == 0xFF0E // Fullwidth Full Stop (period equivalent)
+            || c == 0x2215 // Division Slash (forward slash equivalent)
+            || c == 0x2216 // Set Minus (backslash equivalent)
+            || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs
+            || c == 0xFFFD // Replacement Character (U+FFFD)
+            || c == 0xFEFF // Byte Order Mark (BOM)
+            || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters
+            || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') {
+            return false;
+        }
+    }
+
+    // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename
+    // Unicode and other whitespace is not affected, only 0x20 space
+    if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') {
+        return false;
+    }
+
+    // Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead)
+    if (filename.find("..") != std::string::npos) {
+        return false;
+    }
+
+    // Reject "." (already rejected by the trailing-'.' check above; kept as an explicit guard)
+    if (filename == ".") {
+        return false;
+    }
+
+    return true;
+}
+
 //
 // String utils
 //
@@ -1517,6 +1717,18 @@ std::vector<std::string> string_split(std::string input, char separator) {
    return parts;
 }

+std::string string_strip(const std::string & str) { // returns a copy of `str` without leading and trailing whitespace (as classified by std::isspace)
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && std::isspace(static_cast<unsigned char>(str[start]))) { // cast: isspace() on a negative char value is undefined behaviour
+        start++;
+    }
+    while (end > start && std::isspace(static_cast<unsigned char>(str[end - 1]))) { // same cast for the trailing scan
+        end--;
+    }
+    return str.substr(start, end - start); // empty when the input is empty or all whitespace
+}
+
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
    std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
        {"top_k",       llama_sampler_type::TOP_K},
@@ -1613,6 +1825,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
+    mparams.check_tensors   = params.check_tensors;
    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
    } else {
@@ -1674,7 +1887,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
    cparams.pooling_type      = params.pooling_type;
    cparams.defrag_thold      = params.defrag_thold;
+    cparams.cb_eval           = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
    cparams.offload_kqv       = !params.no_kv_offload;
+    cparams.flash_attn        = params.flash_attn;

    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
@@ -1705,59 +1921,75 @@ void llama_batch_add(

 #ifdef LLAMA_USE_CURL

-static bool llama_download_file(CURL * curl, const char * url, const char * path) {
+static bool starts_with(const std::string & str, const std::string & prefix) {
+    // Hand-written prefix test, used until C++20's std::string::starts_with is available.
+    return str.compare(0, prefix.size(), prefix) == 0;
+}
+
+static bool llama_download_file(const std::string & url, const std::string & path) {
+
+    // Initialize libcurl
+    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
+    if (!curl) {
+        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
+        return false;
+    }
+
    bool force_download = false;

    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl, CURLOPT_URL, url);
-    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);

 #if defined(_WIN32)
    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
    //   operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
 #endif

    // Check if the file already exists locally
    struct stat model_file_info;
-    auto file_exists = (stat(path, &model_file_info) == 0);
+    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);

-    // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
-    char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-    char etag_path[PATH_MAX] = {0};
-    snprintf(etag_path, sizeof(etag_path), "%s.etag", path);
-
-    char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-    char last_modified_path[PATH_MAX] = {0};
-    snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path);
+    // If the file exists, check its JSON metadata companion file.
+    std::string metadata_path = path + ".json";
+    nlohmann::json metadata;
+    std::string etag;
+    std::string last_modified;

    if (file_exists) {
-        auto * f_etag = fopen(etag_path, "r");
-        if (f_etag) {
-            if (!fgets(etag, sizeof(etag), f_etag)) {
-                fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
-            } else {
-                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, etag_path, etag);
+        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+        std::ifstream metadata_in(metadata_path);
+        if (metadata_in.good()) {
+            try {
+                metadata_in >> metadata;
+                fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                if (metadata.contains("url") && metadata["url"].is_string()) {
+                    auto previous_url = metadata["url"].get<std::string>();
+                    if (previous_url != url) {
+                        fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
+                        return false;
+                    }
+                }
+                if (metadata.contains("etag") && metadata["etag"].is_string()) {
+                    etag = metadata["etag"];
+                }
+                if (metadata.contains("lastModified") && metadata["lastModified"].is_string()) {
+                    last_modified = metadata["lastModified"];
+                }
+            } catch (const nlohmann::json::exception & e) {
+                fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+                return false;
            }
-            fclose(f_etag);
-        }
-
-        auto * f_last_modified = fopen(last_modified_path, "r");
-        if (f_last_modified) {
-            if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
-                fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
-            } else {
-                fprintf(stderr, "%s: previous file found %s: %s\n", __func__, last_modified_path,
-                        last_modified);
-            }
-            fclose(f_last_modified);
        }
+    } else {
+        fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str());
    }

    // Send a HEAD request to retrieve the etag and last-modified headers
    struct llama_load_model_from_url_headers {
-        char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
-        char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
+        std::string etag;
+        std::string last_modified;
    };
    llama_load_model_from_url_headers headers;
    {
@@ -1765,38 +1997,37 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
            llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;

-            // Convert header field name to lowercase
-            for (size_t i = 0; i < n_items && buffer[i] != ':'; ++i) {
-                buffer[i] = tolower(buffer[i]);
-            }
+            static std::regex header_regex("([^:]+): (.*)\r\n");
+            static std::regex etag_regex("ETag", std::regex_constants::icase);
+            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);

-            const char * etag_prefix = "etag: ";
-            if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
-                strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
-            }
-
-            const char * last_modified_prefix = "last-modified: ";
-            if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) {
-                strncpy(headers->last_modified, buffer + strlen(last_modified_prefix),
-                        n_items - strlen(last_modified_prefix) - 2); // Remove CRLF
+            std::string header(buffer, n_items);
+            std::smatch match;
+            if (std::regex_match(header, match, header_regex)) {
+                const std::string & key = match[1];
+                const std::string & value = match[2];
+                if (std::regex_match(key, match, etag_regex)) {
+                    headers->etag = value;
+                } else if (std::regex_match(key, match, last_modified_regex)) {
+                    headers->last_modified = value;
+                }
            }
            return n_items;
        };

-        curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers);
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);

-        CURLcode res = curl_easy_perform(curl);
+        CURLcode res = curl_easy_perform(curl.get());
        if (res != CURLE_OK) {
-            curl_easy_cleanup(curl);
            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
            return false;
        }

        long http_code = 0;
-        curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code);
+        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
        if (http_code != 200) {
            // HEAD not supported, we don't know if the file has changed
            // force trigger downloading
@@ -1805,28 +2036,30 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
        }
    }

-    // If the ETag or the Last-Modified headers are different: trigger a new download
-    bool should_download = !file_exists
-        || force_download
-        || (strlen(headers.etag) > 0 && strcmp(etag, headers.etag) != 0)
-        || (strlen(headers.last_modified) > 0 && strcmp(last_modified, headers.last_modified) != 0);
+    bool should_download = !file_exists || force_download;
+    if (!should_download) {
+        if (!etag.empty() && etag != headers.etag) {
+            fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
+            should_download = true;
+        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
+            fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
+            should_download = true;
+        }
+    }
    if (should_download) {
-        char path_temporary[PATH_MAX] = {0};
-        snprintf(path_temporary, sizeof(path_temporary), "%s.downloadInProgress", path);
+        std::string path_temporary = path + ".downloadInProgress";
        if (file_exists) {
-            fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path);
-            if (remove(path) != 0) {
-                curl_easy_cleanup(curl);
-                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path);
+            fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+            if (remove(path.c_str()) != 0) {
+                fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str());
                return false;
            }
        }

        // Set the output file
-        auto * outfile = fopen(path_temporary, "wb");
+        std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path_temporary.c_str(), "wb"), fclose);
        if (!outfile) {
-            curl_easy_cleanup(curl);
-            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path);
+            fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str());
            return false;
        }

@@ -1834,12 +2067,12 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path
        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
            return fwrite(data, size, nmemb, (FILE *)fd);
        };
-        curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
-        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-        curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile);
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
+        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());

        //  display download progress
-        curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);

        // helper function to hide password in URL
        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
@@ -1858,51 +2091,34 @@ static bool llama_download_file(CURL * curl, const char * url, const char * path

        // start the download
        fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-                llama_download_hide_password_in_url(url).c_str(), path, headers.etag, headers.last_modified);
-        auto res = curl_easy_perform(curl);
+                llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
+        auto res = curl_easy_perform(curl.get());
        if (res != CURLE_OK) {
-            fclose(outfile);
-            curl_easy_cleanup(curl);
            fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
            return false;
        }

        long http_code = 0;
-        curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code);
+        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
        if (http_code < 200 || http_code >= 400) {
-            fclose(outfile);
-            curl_easy_cleanup(curl);
            fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
            return false;
        }

-        // Clean up
-        fclose(outfile);
+        // Causes file to be closed explicitly here before we rename it.
+        outfile.reset();

-        // Write the new ETag to the .etag file
-        if (strlen(headers.etag) > 0) {
-            auto * etag_file = fopen(etag_path, "w");
-            if (etag_file) {
-                fputs(headers.etag, etag_file);
-                fclose(etag_file);
-                fprintf(stderr, "%s: file etag saved %s: %s\n", __func__, etag_path, headers.etag);
-            }
-        }
+        // Write the updated JSON metadata file.
+        metadata.update({
+            {"url", url},
+            {"etag", headers.etag},
+            {"lastModified", headers.last_modified}
+        });
+        std::ofstream(metadata_path) << metadata.dump(4);
+        fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str());

-        // Write the new lastModified to the .etag file
-        if (strlen(headers.last_modified) > 0) {
-            auto * last_modified_file = fopen(last_modified_path, "w");
-            if (last_modified_file) {
-                fputs(headers.last_modified, last_modified_file);
-                fclose(last_modified_file);
-                fprintf(stderr, "%s: file last modified saved %s: %s\n", __func__, last_modified_path,
-                        headers.last_modified);
-            }
-        }
-
-        if (rename(path_temporary, path) != 0) {
-            curl_easy_cleanup(curl);
-            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary, path);
+        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
+            fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
            return false;
        }
    }
@@ -1920,15 +2136,7 @@ struct llama_model * llama_load_model_from_url(
        return NULL;
    }

-    // Initialize libcurl
-    auto * curl = curl_easy_init();
-
-    if (!curl) {
-        fprintf(stderr, "%s: error initializing libcurl\n", __func__);
-        return NULL;
-    }
-
-    if (!llama_download_file(curl, model_url, path_model)) {
+    if (!llama_download_file(model_url, path_model)) {
        return NULL;
    }

@@ -1942,7 +2150,6 @@ struct llama_model * llama_load_model_from_url(
        auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
        if (!ctx_gguf) {
            fprintf(stderr, "\n%s:  failed to load input GGUF from %s\n", __func__, path_model);
-            curl_easy_cleanup(curl);
            return NULL;
        }

@@ -1954,8 +2161,6 @@ struct llama_model * llama_load_model_from_url(
        gguf_free(ctx_gguf);
    }

-    curl_easy_cleanup(curl);
-
    if (n_split > 1) {
        char split_prefix[PATH_MAX] = {0};
        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
@@ -1986,11 +2191,7 @@ struct llama_model * llama_load_model_from_url(
                char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
                llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);

-                auto * curl = curl_easy_init();
-                bool res = llama_download_file(curl, split_url, split_path);
-                curl_easy_cleanup(curl);
-
-                return res;
+                return llama_download_file(split_url, split_path);
            }, idx));
        }

@@ -2121,7 +2322,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
        params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
    }

-    {
+    if (params.warmup) {
        LOG("warming up the model with an empty run\n");

        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
@@ -2141,23 +2342,23 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 std::vector<llama_token> llama_tokenize(
  const struct llama_context * ctx,
           const std::string & text,
-                        bool   add_bos,
-                        bool   special) {
-    return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
+                        bool   add_special,
+                        bool   parse_special) {
+    return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
 }

 std::vector<llama_token> llama_tokenize(
    const struct llama_model * model,
           const std::string & text,
-                        bool   add_bos,
-                        bool   special) {
+                        bool   add_special,
+                        bool   parse_special) {
    // upper limit for the number of tokens
-    int n_tokens = text.length() + add_bos;
+    int n_tokens = text.length() + 2 * add_special;
    std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
+        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
@@ -2165,12 +2366,12 @@ std::vector<llama_token> llama_tokenize(
    return result;
 }

-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
@@ -2477,7 +2678,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
-    fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
+    fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
@@ -2512,6 +2713,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
+    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);

    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
--- a/common/common.h
+++ b/common/common.h
@@ -31,6 +31,8 @@
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
 } while(0)

+#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const *LLAMA_COMMIT;
@@ -39,6 +41,7 @@ extern char const *LLAMA_BUILD_TARGET;

 struct llama_control_vector_load_info;

+int get_math_cpu_count();
 int32_t get_num_physical_cores();

 //
@@ -48,7 +51,7 @@ int32_t get_num_physical_cores();
 struct gpt_params {
    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed

-    int32_t n_threads             = get_num_physical_cores();
+    int32_t n_threads             = get_math_cpu_count();
    int32_t n_threads_draft       = -1;
    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
    int32_t n_threads_batch_draft = -1;
@@ -80,15 +83,18 @@ struct gpt_params {
    int32_t yarn_orig_ctx         = 0;     // YaRN original context length
    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold

+    ggml_backend_sched_eval_callback cb_eval = nullptr;
+    void * cb_eval_user_data                 = nullptr;
+
    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

-    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
-    llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings

    // // sampling parameters
    struct llama_sampling_params sparams;

-    std::string model                = "models/7B/ggml-model-f16.gguf"; // model path
+    std::string model                = "";  // model path
    std::string model_draft          = "";  // draft model for speculative decoding
    std::string model_alias          = "unknown"; // model alias
    std::string model_url            = "";  // model url to download
@@ -129,7 +135,7 @@ struct gpt_params {
    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed

-    bool   kl_divergence   = false; // compute KL-divergence
+    bool   kl_divergence   = false; // compute KL divergence

    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
@@ -144,6 +150,7 @@ struct gpt_params {
    bool multiline_input   = false; // reverse the usage of `\`
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
+    bool flash_attn        = false; // flash attention

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool ignore_eos        = false; // ignore generated EOS tokens
@@ -156,15 +163,21 @@ struct gpt_params {
    bool infill            = false; // use infill mode
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
+    bool warmup            = true;  // warmup run
+    bool check_tensors     = false; // validate tensor data

    std::string cache_type_k = "f16"; // KV cache data type for the K
    std::string cache_type_v = "f16"; // KV cache data type for the V

    // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector
-    std::string image  = ""; // path to an image file
+    std::string mmproj = "";        // path to multimodal projector
+    std::vector<std::string> image; // path to image file(s)
 };

+void gpt_params_handle_model_default(gpt_params & params);
+
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -179,6 +192,8 @@ std::string gpt_random_prompt(std::mt19937 & rng);

 void process_escapes(std::string& input);

+bool validate_file_name(const std::string & filename);
+
 //
 // String utils
 //
@@ -186,6 +201,7 @@ void process_escapes(std::string& input);
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string> string_split(std::string input, char separator);
+std::string string_strip(const std::string & str);
 std::string sampler_type_to_name_string(llama_sampler_type sampler_type);

 //
@@ -221,20 +237,21 @@ void llama_batch_add(
 std::vector<llama_token> llama_tokenize(
  const struct llama_context * ctx,
           const std::string & text,
-                        bool   add_bos,
-                        bool   special = false);
+                        bool   add_special,
+                        bool   parse_special = false);

 std::vector<llama_token> llama_tokenize(
    const struct llama_model * model,
           const std::string & text,
-                        bool   add_bos,
-                        bool   special = false);
+                        bool   add_special,
+                        bool   parse_special = false);

-// tokenizes a token into a piece
+// converts a token into its text piece, optionally rendering special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
        const struct llama_context * ctx,
-                       llama_token   token);
+                       llama_token   token,
+                       bool          special = true);

 // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
 //       that takes into account the tokenizer type and decides how to handle the leading space
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -11,35 +11,101 @@

 using json = nlohmann::ordered_json;

+template <typename Iterator>
+static std::string join(Iterator begin, Iterator end, const std::string & separator);
+
+static std::string repeat(const std::string & str, size_t n);
+// Builds a grammar fragment that matches item_rule between min_items and max_items times (max_items == INT_MAX means unbounded), placing separator_rule between consecutive items when it is non-empty.
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
+    if (separator_rule.empty()) { // the "?" and "+" shorthands are only used when nothing sits between items
+        if (min_items == 0 && max_items == 1) {
+            return item_rule + "?";
+        } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
+            return item_rule + "+";
+        }
+    }
+
+    std::string result;
+    if (min_items > 0) { // mandatory part: min_items copies of the item
+        if (item_rule_is_literal && separator_rule.empty()) { // drop the surrounding quotes, repeat the text, re-quote once
+            result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
+        } else {
+            std::vector<std::string> items(min_items, item_rule);
+            result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
+        }
+    }
+    // optional part: up to up_to_n further items, emitted as nested "( ... )?" groups
+    std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
+        auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
+        // max_items < min_items makes up_to_n negative: treat it like 0 rather than recursing on it or passing it to repeat() as a size_t
+        if (up_to_n <= 0) {
+            return "";
+        } else if (up_to_n == 1) {
+            return "(" + content + ")?";
+        } else if (!separator_rule.empty() && !prefix_with_sep) {
+            return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
+        } else {
+            std::string res = repeat("(" + content + " ", up_to_n);
+            // strip trailing space
+            res = res.substr(0, res.length() - 1);
+            res += repeat(")?", up_to_n);
+            return res;
+        }
+    };
+
+    if (min_items > 0 && max_items != min_items) {
+        result += " ";
+    }
+
+    if (max_items != std::numeric_limits<int>::max()) { // bounded: append the optional tail
+        result += opt_repetitions(max_items - min_items, min_items > 0);
+    } else { // unbounded: append a starred group
+        std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
+        if (min_items == 0 && !separator_rule.empty()) { // first item has no leading separator, so the whole list is optional
+            result = "(" + item_rule + " " + item_operator + "*)?";
+        } else {
+            result += item_operator + "*";
+        }
+    }
+
+    return result;
+}
+
 const std::string SPACE_RULE = "\" \"?";

-std::unordered_map<std::string, std::string> PRIMITIVE_RULES = {
-    {"boolean", "(\"true\" | \"false\") space"},
-    {"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"},
-    {"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"},
-    {"value", "object | array | string | number | boolean"},
-    {"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"},
-    {"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"},
-    {"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"},
-    {"string", " \"\\\"\" (\n"
-               "        [^\"\\\\] |\n"
-               "        \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n"
-               "      )* \"\\\"\" space"},
-    {"null", "\"null\" space"}
+struct BuiltinRule { // one entry of the builtin rule tables
+    std::string content;           // text of the rule body
+    std::vector<std::string> deps; // names of the builtin rules that the body refers to
 };
-std::vector<std::string> OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"};

-std::unordered_map<std::string, std::string> DATE_RULES = {
-    {"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"},
-    {"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"},
-    {"date-time", "date \"T\" time"},
-    {"date-string", "\"\\\"\" date \"\\\"\" space"},
-    {"time-string", "\"\\\"\" time \"\\\"\" space"},
-    {"date-time-string", "\"\\\"\" date-time \"\\\"\" space"}
+const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15); // "[0-9]" repeated 0 to 15 times
+
+std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = { // rule name -> {rule body, names of the rules the body references}
+    {"boolean", {"(\"true\" | \"false\") space", {}}},
+    {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}}, // one digit plus up to 15 more
+    {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}}, // a single digit, or a non-zero digit plus up to 15 more
+    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
+    {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
+    {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}}, // note: "value" and "object"/"array" depend on each other
+    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
+    {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
+    {"char",   {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}}, // one unescaped character or one backslash escape
+    {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
+    {"null", {"\"null\" space", {}}},
+};
+
+std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = { // rule name -> {rule body, names of the rules the body references}
+    {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+    {"date-time", {"date \"T\" time", {"date", "time"}}},
+    {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}}, // the "-string" entries wrap the bare rule in double quotes
+    {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
+    {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
 };

 static bool is_reserved_name(const std::string & name) {
@@ -47,7 +113,7 @@ static bool is_reserved_name(const std::string & name) {
    if (RESERVED_NAMES.empty()) {
        RESERVED_NAMES.insert("root");
        for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
-        for (const auto &p : DATE_RULES) RESERVED_NAMES.insert(p.first);
+        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
    }
    return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }
@@ -192,7 +258,7 @@ private:
                if (_dotall) {
                    rule = "[\\U00000000-\\U0010FFFF]";
                } else {
-                    rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]";
+                    rule = "[^\\x0A\\x0D]";
                }
                return _add_rule("dot", rule);
            };
@@ -308,47 +374,21 @@ private:
                    auto &sub = last.first;
                    auto sub_is_literal = last.second;

-                    if (min_times == 0 && max_times == std::numeric_limits<int>::max()) {
-                        sub += "*";
-                    } else if (min_times == 0 && max_times == 1) {
-                        sub += "?";
-                    } else if (min_times == 1 && max_times == std::numeric_limits<int>::max()) {
-                        sub += "+";
-                    } else {
-                        if (!sub_is_literal) {
-                            std::string & sub_id = sub_rule_ids[sub];
-                            if (sub_id.empty()) {
-                                sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
-                            }
-                            sub = sub_id;
+                    if (!sub_is_literal) {
+                        std::string & sub_id = sub_rule_ids[sub];
+                        if (sub_id.empty()) {
+                            sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
                        }
-                        std::string result;
-                        if (sub_is_literal && min_times > 0) {
-                            result = "\"" + repeat(sub.substr(1, sub.length() - 2), min_times) + "\"";
-                        } else {
-                            for (int j = 0; j < min_times; j++) {
-                                if (j > 0) {
-                                    result += " ";
-                                }
-                                result += sub;
-                            }
-                        }
-                        if (min_times > 0 && min_times < max_times) {
-                            result += " ";
-                        }
-                        if (max_times == std::numeric_limits<int>::max()) {
-                            result += sub + "*";
-                        } else {
-                            for (int j = min_times; j < max_times; j++) {
-                                if (j > min_times) {
-                                    result += " ";
-                                }
-                                result += sub + "?";
-                            }
-                        }
-                        seq.back().first = result;
-                        seq.back().second = false;
+                        sub = sub_id;
                    }
+                    seq.back().first = build_repetition(
+                        sub_is_literal ? "\"" + sub + "\"" : sub,
+                        min_times,
+                        max_times,
+                        "",
+                        sub_is_literal
+                    );
+                    seq.back().second = false;
                } else {
                    std::string literal;
                    auto is_non_literal = [&](char c) {
@@ -424,7 +464,7 @@ private:
        if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
            std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
            std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
-            std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
+            std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
            prop_kv_rule_names["*"] = kv_rule;
            optional_props.push_back("*");
        }
@@ -486,6 +526,25 @@ private:
        return rule;
    }

+    std::string _add_primitive(const std::string & name, const BuiltinRule & rule) {
+        auto n = _add_rule(name, rule.content); // add the rule itself before walking its dependencies
+        for (const auto & dep : rule.deps) {
+            // resolve the dependency among the primitive rules first, then among the string-format rules
+            auto it = PRIMITIVE_RULES.find(dep);
+            if (it == PRIMITIVE_RULES.end()) {
+                it = STRING_FORMAT_RULES.find(dep);
+                if (it == STRING_FORMAT_RULES.end()) {
+                    _errors.push_back("Rule " + dep + " not known");
+                    continue;
+                }
+            }
+            if (_rules.find(dep) == _rules.end()) { // only recurse into rules not added yet; presumably this also ends mutual deps such as value <-> object (verify _add_rule records into _rules)
+                _add_primitive(dep, it->second);
+            }
+        }
+        return n; // the name under which _add_rule registered this rule
+    }
+
 public:
    SchemaConverter(
        const std::function<json(const std::string &)> & fetch_json,
@@ -647,49 +706,33 @@ public:
                return _add_rule(rule_name, rule);
            } else {
                std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
-                std::string list_item_operator = "( \",\" space " + item_rule_name + " )";
-                std::string successive_items;
                int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
                json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
-                int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : -1;
-                if (min_items > 0) {
-                    successive_items += repeat(list_item_operator, min_items - 1);
-                    min_items--;
-                }
-                if (max_items >= 0 && max_items > min_items) {
-                    successive_items += repeat(list_item_operator + "?", max_items - min_items - 1);
-                } else {
-                    successive_items += list_item_operator + "*";
-                }
-                std::string rule;
-                if (min_items == 0) {
-                    rule =  "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space";
-                } else {
-                    rule =  "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space";
-                }
-                return _add_rule(rule_name, rule);
+                int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
+
+                return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
            }
        } else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
            return _visit_pattern(schema["pattern"], rule_name);
        } else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
-            return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
-        } else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) {
-            for (const auto & kv : DATE_RULES) {
-                _add_rule(kv.first, kv.second);
-            }
-            return schema_format + "-string";
+            return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
+        } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
+            auto prim_name = schema_format + "-string";
+            return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
+        } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
+            std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
+            int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
+            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
+            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
        } else if (schema.empty() || schema_type == "object") {
-            for (const auto & n : OBJECT_RULE_NAMES) {
-                _add_rule(n, PRIMITIVE_RULES.at(n));
-            }
-            return _add_rule(rule_name, "object");
+            return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
        } else {
            if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
                _errors.push_back("Unrecognized schema: " + schema.dump());
                return "";
            }
            // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
-            return _add_rule(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
+            return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
        }
    }

--- a/common/log.h
+++ b/common/log.h
@@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // INTERNAL, DO NOT USE
 //  USE LOG() INSTEAD
 //
-#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
    #define LOG_IMPL(str, ...)                                                                                      \
    do {                                                                                                            \
        if (LOG_TARGET != nullptr)                                                                                  \
@@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // INTERNAL, DO NOT USE
 //  USE LOG_TEE() INSTEAD
 //
-#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
    #define LOG_TEE_IMPL(str, ...)                                                                                                      \
    do {                                                                                                                                \
        if (LOG_TARGET != nullptr)                                                                                                      \
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,4 +1,6 @@
+#define LLAMA_API_INTERNAL
 #include "sampling.h"
+#include <random>

 struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
    struct llama_sampling_context * result = new llama_sampling_context();
@@ -33,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

    result->prev.resize(params.n_prev);

+    llama_sampling_set_rng_seed(result, params.seed);
+
    return result;
 }

@@ -62,6 +66,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
    ctx->cur.clear();
 }

+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
+    // a seed equal to LLAMA_DEFAULT_SEED is swapped for a value drawn from std::random_device
+    const bool     use_random = seed == LLAMA_DEFAULT_SEED;
+    const uint32_t rng_seed   = use_random ? std::random_device{}() : seed;
+    ctx->rng.seed(rng_seed);
+}
+
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
    if (dst->grammar) {
        llama_grammar_free(dst->grammar);
@@ -203,7 +214,7 @@ static llama_token llama_sampling_sample_impl(

            sampler_queue(ctx_main, params, cur_p, min_keep);

-            id = llama_sample_token(ctx_main, &cur_p);
+            id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);

            //{
            //    const int n_top = 10;
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -4,9 +4,10 @@

 #include "grammar-parser.h"

+#include <random>
 #include <string>
-#include <vector>
 #include <unordered_map>
+#include <vector>

 // sampler types
 enum class llama_sampler_type : char {
@@ -20,25 +21,26 @@ enum class llama_sampler_type : char {

 // sampling parameters
 typedef struct llama_sampling_params {
-    int32_t     n_prev                = 64;       // number of previous tokens to remember
-    int32_t     n_probs               = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t     min_keep              = 0;        // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t     top_k                 = 40;       // <= 0 to use vocab size
-    float       top_p                 = 0.95f;    // 1.0 = disabled
-    float       min_p                 = 0.05f;    // 0.0 = disabled
-    float       tfs_z                 = 1.00f;    // 1.0 = disabled
-    float       typical_p             = 1.00f;    // 1.0 = disabled
-    float       temp                  = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float       dynatemp_range        = 0.00f;    // 0.0 = disabled
-    float       dynatemp_exponent     = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float       penalty_repeat        = 1.00f;    // 1.0 = disabled
-    float       penalty_freq          = 0.00f;    // 0.0 = disabled
-    float       penalty_present       = 0.00f;    // 0.0 = disabled
-    int32_t     mirostat              = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float       mirostat_tau          = 5.00f;    // target entropy
-    float       mirostat_eta          = 0.10f;    // learning rate
-    bool        penalize_nl           = false;     // consider newlines as a repeatable token
+    int32_t     n_prev                = 64;                 // number of previous tokens to remember
+    int32_t     n_probs               = 0;                  // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t     min_keep              = 0;                  // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t     top_k                 = 40;                 // <= 0 to use vocab size
+    float       top_p                 = 0.95f;              // 1.0 = disabled
+    float       min_p                 = 0.05f;              // 0.0 = disabled
+    float       tfs_z                 = 1.00f;              // 1.0 = disabled
+    float       typical_p             = 1.00f;              // 1.0 = disabled
+    float       temp                  = 0.80f;              // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float       dynatemp_range        = 0.00f;              // 0.0 = disabled
+    float       dynatemp_exponent     = 1.00f;              // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t     penalty_last_n        = 64;                 // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float       penalty_repeat        = 1.00f;              // 1.0 = disabled
+    float       penalty_freq          = 0.00f;              // 0.0 = disabled
+    float       penalty_present       = 0.00f;              // 0.0 = disabled
+    int32_t     mirostat              = 0;                  // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float       mirostat_tau          = 5.00f;              // target entropy
+    float       mirostat_eta          = 0.10f;              // learning rate
+    bool        penalize_nl           = false;              // consider newlines as a repeatable token
+    uint32_t    seed                  = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context

    std::vector<llama_sampler_type> samplers_sequence = {
        llama_sampler_type::TOP_K,
@@ -79,6 +81,8 @@ struct llama_sampling_context {
    // TODO: replace with ring-buffer
    std::vector<llama_token>      prev;
    std::vector<llama_token_data> cur;
+
+    std::mt19937 rng;
 };

 #include "common.h"
@@ -93,6 +97,9 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
 // - reset grammar
 void llama_sampling_reset(llama_sampling_context * ctx);

+// Set the sampler seed
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
+
 // Copy the sampler context
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);

@@ -129,7 +136,7 @@ llama_token llama_sampling_sample(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
        struct llama_context * ctx_cfg,
-        int idx = 0);
+        int idx = -1);

 // Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
 llama_token_data_array llama_sampling_prepare(
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -0,0 +1,279 @@
+# This script downloads the tokenizer models of the specified models from Huggingface and
+# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
+#
+# This is necessary in order to analyze the type of pre-tokenizer used by the model and
+# provide the necessary information to llama.cpp via the GGUF header in order to implement
+# the same pre-tokenizer.
+#
+# ref: https://github.com/ggerganov/llama.cpp/pull/6920
+#
+# Instructions:
+#
+# - Add a new model to the "models" list
+# - Run the script with your huggingface token:
+#
+#   python3 convert-hf-to-gguf-update.py <huggingface_token>
+#
+# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
+# - Update llama.cpp with the new pre-tokenizer if necessary
+#
+# TODO: generate tokenizer tests for llama.cpp
+# TODO: automate the update of convert-hf-to-gguf.py
+#
+
+import os
+import requests
+import sys
+import json
+
+from hashlib import sha256
+from enum import IntEnum, auto
+
+# Tokenizer kinds referenced by the "tokt" field of the entries in "models".
+# Members are numbered automatically by auto().
+class TOKENIZER_TYPE(IntEnum):
+    SPM = auto()
+    BPE = auto()
+    WPM = auto()
+
+# TODO: this string has to exercise as much pre-tokenizer functionality as possible
+#       will be updated with time - contributions welcome
+chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+
+# exactly one command line argument is expected: the Huggingface token
+if len(sys.argv) != 2:
+    print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+    sys.exit(1)
+
+token = sys.argv[1]
+
+# TODO: add models here, base models preferred
+models = [
+        { "name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+        { "name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+        { "name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
+        { "name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
+        { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
+        { "name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+        { "name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+        { "name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
+        { "name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
+        { "name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+        ]
+
+# make directory "models/tokenizers" if it doesn't exist
+# (exist_ok=True replaces the separate existence check, so there is no window
+# between checking and creating)
+os.makedirs("models/tokenizers", exist_ok=True)
+
+def download_file_with_auth(url, token, save_path):
+    """Download url to save_path, sending token as a bearer token.
+
+    The file is written only when the server answers with HTTP 200. Any other
+    status is reported on stdout and nothing is written.
+
+    Args:
+        url: address of the file to fetch.
+        token: value placed in the "Authorization: Bearer ..." header.
+        save_path: destination path, opened in binary write mode.
+
+    Returns:
+        True if the file was saved, False otherwise.
+    """
+    headers = {"Authorization": f"Bearer {token}"}
+    response = requests.get(url, headers=headers)
+    if response.status_code != 200:
+        # name the URL so the failure can be traced to a specific file
+        print(f"Failed to download file {url}. Status code: {response.status_code}")
+        return False
+
+    with open(save_path, 'wb') as f:
+        f.write(response.content)
+    print(f"File {save_path} downloaded successfully")
+    return True
+
+# download the tokenizer models
+# Files are skipped one by one (not per directory): a download that failed
+# part-way is retried on the next run instead of being hidden behind a model
+# directory that was already created.
+for model in models:
+    name = model["name"]
+    repo = model["repo"]
+    tokt = model["tokt"]
+
+    model_dir = f"models/tokenizers/{name}"
+    os.makedirs(model_dir, exist_ok=True)
+
+    # (URL path segment, file name) pairs, in download order
+    files = [("raw", "config.json"), ("raw", "tokenizer.json")]
+    if tokt == TOKENIZER_TYPE.SPM:
+        # tokenizer.model is fetched through "resolve" rather than "raw"
+        files.append(("resolve", "tokenizer.model"))
+    files.append(("raw", "tokenizer_config.json"))
+
+    print(f"Downloading {name} to {model_dir}")
+
+    for kind, fname in files:
+        save_path = f"{model_dir}/{fname}"
+        if os.path.isfile(save_path):
+            print(f"File {save_path} already exists - skipping")
+            continue
+        download_file_with_auth(f"{repo}/{kind}/main/{fname}", token, save_path)
+
+# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
+# TODO: auto-update convert-hf-to-gguf.py with the generated function
+
+# Collect one "if chkhsh == ..." branch per non-SPM model: the model's tokenizer
+# encodes chktxt and the sha256 of the stringified token list identifies it.
+if_blocks = []
+for model in models:
+    name = model["name"]
+    tokt = model["tokt"]
+
+    # SPM models get no branch
+    if tokt == TOKENIZER_TYPE.SPM:
+        continue
+
+    # create the tokenizer
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+
+    chktok = tokenizer.encode(chktxt)
+    chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+    print(f"model: {name}")
+    print(f"tokt: {tokt}")
+    print(f"repo: {model['repo']}")
+    print(f"chktok: {chktok}")
+    print(f"chkhsh: {chkhsh}")
+
+    # print the "pre_tokenizer" content from the tokenizer.json
+    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
+        cfg = json.load(f)
+    pre_tokenizer = cfg["pre_tokenizer"]
+    print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+
+    print("\n")
+
+    if_blocks.append(
+        f"        if chkhsh == \"{chkhsh}\":\n"
+        f"            # ref: {model['repo']}\n"
+        f"            res = \"{name}\"\n"
+    )
+
+src_ifs = "".join(if_blocks)
+
+# Assemble the source text of get_vocab_base_pre() line by line. Only two lines
+# are f-strings: the one embedding repr(chktxt) and the one embedding the
+# generated src_ifs branches. The remaining lines are plain strings, so the braces
+# in them are emitted literally and become f-string fields of the generated code.
+src_func = ""
+src_func +=  "    def get_vocab_base_pre(self, tokenizer) -> str:\n"
+src_func +=  "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
+src_func +=  "        # is specific for the BPE pre-tokenizer used by the model\n"
+src_func +=  "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
+src_func +=  "        # use in llama.cpp to implement the same pre-tokenizer\n"
+src_func +=  "\n"
+src_func += f"        chktxt = {repr(chktxt)}\n"
+src_func +=  "\n"
+src_func +=  "        chktok = tokenizer.encode(chktxt)\n"
+src_func +=  "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
+src_func +=  "\n"
+src_func +=  "        print(f\"chktok: {chktok}\")\n"
+src_func +=  "        print(f\"chkhsh: {chkhsh}\")\n"
+src_func +=  "\n"
+src_func +=  "        res = None\n"
+src_func +=  "\n"
+src_func +=  "        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
+src_func +=  "        #       or pull the latest version of the model from Huggingface\n"
+src_func +=  "        #       don't edit the hashes manually!\n"
+src_func += f"{src_ifs}\n"
+src_func +=  "        if res is None:\n"
+src_func +=  "            print(\"\\n\")\n"
+src_func +=  "            print(\"**************************************************************************************\")\n"
+src_func +=  "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
+src_func +=  "            print(\"**          There are 2 possible reasons for this:\")\n"
+src_func +=  "            print(\"**          - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
+src_func +=  "            print(\"**          - the pre-tokenization config has changed upstream\")\n"
+src_func +=  "            print(\"**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
+src_func +=  "            print(\"** ref:     https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
+src_func +=  "            print(\"**\")\n"
+src_func +=  "            print(f\"** chkhsh:  {chkhsh}\")\n"
+src_func +=  "            print(\"**************************************************************************************\")\n"
+src_func +=  "            print(\"\\n\")\n"
+src_func +=  "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
+src_func +=  "\n"
+src_func +=  "        print(f\"tokenizer.ggml.pre: {res}\")\n"
+src_func +=  "        print(f\"chkhsh: {chkhsh}\")\n"
+src_func +=  "\n"
+src_func +=  "        return res\n"
+
+print(src_func)
+
+print("\n")
+print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
+print("\n")
+
+# generate tests for each tokenizer model
+
+tests = [
+    "",
+    " ",
+    "  ",
+    "   ",
+    "\t",
+    "\n",
+    "\n\n",
+    "\n\n\n",
+    "\t\n",
+    "Hello world",
+    " Hello world",
+    "Hello World",
+    " Hello World",
+    " Hello World!",
+    "Hello, world!",
+    " Hello, world!",
+    " this is 🦙.cpp",
+    "w048 7tuijk dsdfhu",
+    "нещо на Български",
+    "កាន់តែពិសេសអាចខលចេញ",
+    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
+    "Hello",
+    " Hello",
+    "  Hello",
+    "   Hello",
+    "    Hello",
+    "    Hello\n    Hello",
+    " (",
+    "\n =",
+    "' era",
+    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
+    "3",
+    "33",
+    "333",
+    "3333",
+    "33333",
+    "333333",
+    "3333333",
+    "33333333",
+    "333333333",
+    chktxt,
+]
+
+# write the tests to ./models/ggml-vocab-{name}.gguf.inp
+# the format is:
+#
+# test0
+# __ggml_vocab_test__
+# test1
+# __ggml_vocab_test__
+# ...
+#
+
+# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
+# for each test, write the resulting tokens on a separate line
+
+# For every model: write all test strings to the .inp file, each followed by
+# the separator line, then write one line of token ids per test to the .out file.
+for model in models:
+    name = model["name"]
+    tokt = model["tokt"]
+
+    # create the tokenizer
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+
+    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f_inp:
+        f_inp.writelines(f"{text}\n__ggml_vocab_test__\n" for text in tests)
+
+    with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f_out:
+        for text in tests:
+            token_ids = tokenizer.encode(text, add_special_tokens=False)
+            # every id is preceded by a single space, including the first one
+            f_out.write("".join(f" {token_id}" for token_id in token_ids) + "\n")
+
+    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+
+# generate commands for creating vocab files
+
+print("\nRun the following commands to generate the vocab files for testing:\n")
+
+# one convert-hf-to-gguf.py invocation per model, in the order of "models"
+for model in models:
+    name = model["name"]
+    command = f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only"
+    print(command)
+
+print("\n")
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -11,6 +11,7 @@ import sys
 from abc import ABC, abstractmethod
 from enum import IntEnum
 from pathlib import Path
+from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast

 import numpy as np
@@ -43,17 +44,18 @@ AnyModel = TypeVar("AnyModel", bound="type[Model]")
 class Model(ABC):
    _model_classes: dict[str, type[Model]] = {}

-    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
+    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool):
        self.dir_model = dir_model
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
+        self.use_temp_file = use_temp_file
        self.is_safetensors = self._is_model_safetensors()
        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
+        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])

    @property
@@ -160,7 +162,7 @@ class Model(ABC):
                data = data.astype(np.float32)

            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
                data = data.astype(np.float32)

            # if f16 desired, convert any float32 2-dim weight tensors to float16
@@ -227,17 +229,18 @@ class Model(ABC):
            return ("pytorch_model.bin",)
        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

-    def _set_vocab_gpt2(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
+    # used for GPT-2 BPE and WordPiece vocabs
+    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
        tokens: list[str] = []
        toktypes: list[int] = []

        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model)
-        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
        assert max(tokenizer.vocab.values()) < vocab_size

+        tokpre = self.get_vocab_base_pre(tokenizer)
+
        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
        added_vocab = tokenizer.get_added_vocab()

@@ -255,11 +258,83 @@ class Model(ABC):
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)

+        return tokens, toktypes, tokpre
+
+    # NOTE: this function is generated by convert-hf-to-gguf-update.py
+    #       do not modify it manually!
+    # ref:  https://github.com/ggerganov/llama.cpp/pull/6920
+    def get_vocab_base_pre(self, tokenizer) -> str:
+        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
+        # is specific for the BPE pre-tokenizer used by the model
+        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
+        # use in llama.cpp to implement the same pre-tokenizer
+
+        chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+
+        chktok = tokenizer.encode(chktxt)
+        chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+        print(f"chktok: {chktok}")
+        print(f"chkhsh: {chkhsh}")
+
+        res = None
+
+        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
+        #       or pull the latest version of the model from Huggingface
+        #       don't edit the hashes manually!
+        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
+            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+            res = "llama-bpe"
+        if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
+            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
+            res = "deepseek-llm"
+        if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
+            # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
+            res = "deepseek-coder"
+        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
+            # ref: https://huggingface.co/tiiuae/falcon-7b
+            res = "falcon"
+        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
+            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
+            res = "bert-bge"
+        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
+            # ref: https://huggingface.co/mosaicml/mpt-7b
+            res = "mpt"
+        if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
+            # ref: https://huggingface.co/bigcode/starcoder2-3b
+            res = "starcoder"
+        if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
+            # ref: https://huggingface.co/openai-community/gpt2
+            res = "gpt-2"
+
+        if res is None:
+            print("\n")
+            print("**************************************************************************************")
+            print("** WARNING: The BPE pre-tokenizer was not recognized!")
+            print("**          There are 2 possible reasons for this:")
+            print("**          - the model has not been added to convert-hf-to-gguf-update.py yet")
+            print("**          - the pre-tokenization config has changed upstream")
+            print("**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+            print("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
+            print("**")
+            print(f"** chkhsh:  {chkhsh}")
+            print("**************************************************************************************")
+            print("\n")
+            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
+
+        print(f"tokenizer.ggml.pre: {res}")
+        print(f"chkhsh: {chkhsh}")
+
+        return res
+
+    def _set_vocab_gpt2(self) -> None:
+        tokens, toktypes, tokpre = self.get_vocab_base()
        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

    def _set_vocab_qwen(self):
@@ -273,6 +348,8 @@ class Model(ABC):
        vocab_size = hparams["vocab_size"]
        assert max(tokenizer.get_vocab().values()) < vocab_size

+        tokpre = self.get_vocab_base_pre(tokenizer)
+
        merges = []
        vocab = {}
        mergeable_ranks = tokenizer.mergeable_ranks
@@ -300,6 +377,7 @@ class Model(ABC):
                toktypes.append(gguf.TokenType.NORMAL)

        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

@@ -359,9 +437,20 @@ class Model(ABC):
                        scores.append(-1000.0)
                        toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            print(
+                f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]"
+            )
+            for i in range(1, pad_count + 1):
+                tokens.append(f"[PAD{i}]")
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
        assert len(tokens) == vocab_size

        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
@@ -383,6 +472,7 @@ class Model(ABC):
        assert len(tokens) == vocab.vocab_size

        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
@@ -826,6 +916,7 @@ class XverseModel(Model):
            toktypes.append(toktype)

        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_types(toktypes)

@@ -1203,9 +1294,91 @@ class StableLMModel(Model):
        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))

+    def write_tensors(self):
+        """Convert the model tensors and add them to the GGUF writer.
+
+        Tensors whose name contains "q_layernorm.norms" or "k_layernorm.norms"
+        are buffered and handed to _stack_qk_norm() once block_count * n_head
+        (respectively block_count * n_kv_head) of them have been collected.
+        Every other tensor is renamed through the tensor name map, converted
+        according to self.ftype and written individually.
+
+        Exits the process with status 1 if a tensor name cannot be mapped.
+        """
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_head = self.hparams.get("num_attention_heads")
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        q_norms = dict()
+        k_norms = dict()
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+            n_dims = len(data.shape)
+
+            # q/k layernorm weights are not written one by one: buffer them until
+            # the expected number has been seen, then stack them per block
+            if "q_layernorm.norms" in name:
+                q_norms[name] = data
+                if len(q_norms) >= (block_count * n_head):
+                    self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
+                continue
+            if "k_layernorm.norms" in name:
+                k_norms[name] = data
+                if len(k_norms) >= (block_count * n_kv_head):
+                    self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
+                continue
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                # an unmappable tensor is an error: exit with a failure status
+                sys.exit(1)
+
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+    def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
+        """Stack the buffered layernorm weights of each block into one tensor and write it.
+
+        For every block index the entries
+        "model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" with xid in
+        range(n_head) are removed from norms, stacked along a new first axis and
+        written under the mapped name of
+        "model.layers.{bid}.self_attn.{layer_name}.weight".
+
+        Args:
+            block_count: number of blocks to process.
+            name: name of the tensor that triggered the stacking; only its
+                ".weight" suffix is inspected here.
+            tensor_map: name map used to translate the merged tensor name.
+            n_head: number of entries stacked per block.
+            norms: dict of buffered arrays keyed by tensor name; consumed entries
+                are deleted from it.
+            n_dims: number of dimensions supplied by the caller, used in the
+                dtype conversion checks.
+            layer_name: "q_layernorm" or "k_layernorm".
+
+        Exits the process with status 1 if the merged name cannot be mapped.
+        """
+        for bid in range(block_count):
+            datas = []
+            for xid in range(n_head):
+                ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
+                datas.append(norms[ename])
+                del norms[ename]
+            data = np.stack(datas, axis=0)
+            data_dtype = data.dtype
+            merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
+            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                # report the name that failed to map (not the triggering tensor)
+                # and exit with a failure status
+                print(f"Can not map tensor {merged_name!r}")
+                sys.exit(1)
+            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+

@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
@@ -1215,7 +1388,23 @@ class LlamaModel(Model):
        try:
            self. _set_vocab_sentencepiece()
        except FileNotFoundError:
-            self._set_vocab_llama_hf()
+            try:
+                self._set_vocab_llama_hf()
+            except (FileNotFoundError, TypeError):
+                # Llama 3
+                self._set_vocab_gpt2()
+
+        # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
+        if self.hparams.get("vocab_size", 32000) == 32016:
+            special_vocab = gguf.SpecialVocab(
+                self.dir_model, load_merges=False,
+                special_token_types = ['prefix', 'suffix', 'middle', 'eot']
+            )
+            special_vocab._set_special_token("prefix", 32007)
+            special_vocab._set_special_token("suffix", 32008)
+            special_vocab._set_special_token("middle", 32009)
+            special_vocab._set_special_token("eot",    32010)
+            special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
@@ -1223,6 +1412,11 @@ class LlamaModel(Model):
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])

+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
    # Same as super class, but permuting q_proj, k_proj
    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@@ -1424,6 +1618,102 @@ class GrokModel(Model):
            self.gguf_writer.add_tensor(new_name, data)


+@Model.register("DbrxForCausalLM")
+class DbrxModel(Model):
+    model_arch = gguf.MODEL_ARCH.DBRX
+
+    def set_gguf_parameters(self):
+        ffn_config = self.hparams["ffn_config"]
+        attn_config = self.hparams["attn_config"]
+        self.gguf_writer.add_name(self.hparams["model_type"])
+        self.gguf_writer.add_block_count(self.hparams["n_layers"])
+
+        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
+
+        self.gguf_writer.add_head_count(self.hparams["n_heads"])
+        self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
+
+        self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
+
+        self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
+        # NOTE: the file type is written exactly once, at the end of this method
+
+        self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
+        self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
+
+        self.gguf_writer.add_layer_norm_eps(1e-5)
+
+        self.gguf_writer.add_file_type(self.ftype)
+        print(f"gguf: file type = {self.ftype}")
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers")
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        for name, data_torch in self.get_tensors():
+            n_expert = self.hparams["ffn_config"]["moe_num_experts"]
+            n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
+            n_embd = self.hparams["d_model"]
+
+            # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
+            # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
+            # But llama.cpp moe graph works differently
+            # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
+            # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
+            exp_tensor_names = {"ffn.experts.mlp.w1": None,       # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff,   n_expert}
+                                "ffn.experts.mlp.w2": (0, 2, 1),  # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff,   n_embd, n_expert}
+                                "ffn.experts.mlp.v1": None}       # LLM_TENSOR_FFN_UP_EXPS   ggml_tensor->ne{n_embd, n_ff,   n_expert}
+            experts = False
+            for exp_tensor_name in exp_tensor_names.keys():
+                if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
+                    experts = True
+                    data_torch = data_torch.view(n_expert, n_ff, n_embd)
+                    if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
+                        data_torch = data_torch.permute(*permute_tensor)
+                    break
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            # In MoE models the ffn tensors are typically most of the model weights,
+            # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
+            # Every other model has the weight names ending in .weight,
+            # let's assume that is the convention which is not the case for dbrx:
+            # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
+            new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # Most of the codebase that takes in 1D tensors only handles F32 tensors
+            # and most of the outputs tensors are F32.
+            if data_dtype != np.float32 and n_dims == 1:
+                print(f"Can not map tensor {name!r}: all 1D tensors must be F32")
+                sys.exit()
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+
@Model.register("MiniCPMForCausalLM")
 class MiniCPMModel(Model):
    model_arch = gguf.MODEL_ARCH.MINICPM
@@ -1591,6 +1881,111 @@ class QwenModel(Model):
 class Qwen2Model(Model):
    model_arch = gguf.MODEL_ARCH.QWEN2

+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+
+@Model.register("Qwen2MoeForCausalLM")
+class Qwen2MoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN2MOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_experts = self.hparams.get("num_experts")
+        experts = dict()
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # process the experts separately: buffer them until a full set can be merged
+            if name.find("experts") != -1:
+                experts[name] = data
+                if len(experts) >= n_experts * 3:
+                    # merge the experts into a single 3d tensor
+                    for bid in range(block_count):
+                        for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                            full = True
+                            for xid in range(n_experts):
+                                ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                                if ename not in experts:
+                                    full = False
+                                    break
+                            if not full:
+                                continue
+
+                            datas = []
+                            for xid in range(n_experts):
+                                ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                                datas.append(experts[ename])
+                                del experts[ename]
+
+                            data = np.stack(datas, axis=0)
+                            data_dtype = data.dtype
+
+                            if self.ftype == 0 and data_dtype == np.float16:
+                                data = data.astype(np.float32)
+
+                            if self.ftype == 1 and data_dtype == np.float32:
+                                data = data.astype(np.float16)
+
+                            merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+                            if new_name is None:
+                                print(f"Can not map tensor {merged_name!r}")
+                                sys.exit()
+
+                            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+                            self.gguf_writer.add_tensor(new_name, data)
+                continue
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+        if len(experts) > 0:
+            raise ValueError(f"Unprocessed experts: {experts.keys()}")
+

@Model.register("GPT2LMHeadModel")
 class GPT2Model(Model):
@@ -1682,6 +2077,92 @@ class Phi2Model(Model):
        self.gguf_writer.add_add_bos_token(False)


+@Model.register("Phi3ForCausalLM")
+class Phi3MiniModel(Model):
+    model_arch = gguf.MODEL_ARCH.PHI3
+
+    def set_vocab(self):
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        if not tokenizer_path.is_file():
+            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
+            sys.exit(1)
+
+        tokenizer = SentencePieceProcessor(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+
+            piece = tokenizer.id_to_piece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.get_score(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
+
+        rot_pct = 1.0
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        rms_eps = self.find_hparam(["rms_norm_eps"])
+
+        self.gguf_writer.add_name("Phi3")
+        self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
+
+        self.gguf_writer.add_embedding_length(n_embd)
+        self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"], optional=True) or 8192)  # 8192 was the previous hard-coded value
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+        self.gguf_writer.add_file_type(self.ftype)
+
+
@Model.register("PlamoForCausalLM")
 class PlamoModel(Model):
    model_arch = gguf.MODEL_ARCH.PLAMO
@@ -1896,6 +2377,7 @@ class InternLM2Model(Model):
                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
@@ -1905,6 +2387,8 @@ class InternLM2Model(Model):
        old_eos = special_vocab.special_token_ids["eos"]
        if "chat" in os.path.basename(self.dir_model.absolute()):
            # For the chat model, we replace the eos with '<|im_end|>'.
+            # TODO: this is a hack, should be fixed
+            #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
            print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
 in chat mode so that the conversation can end normally.")
@@ -2043,34 +2527,26 @@ class BertModel(Model):
            self.gguf_writer.add_pooling_type(pooling_type)

    def set_vocab(self):
-        # use huggingface vocab to get all tokens
-        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
-        tokens, scores, toktypes = zip(*vocab.all_tokens())
-        assert len(tokens) == vocab.vocab_size
-        self.vocab_size = vocab.vocab_size
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.vocab_size = len(tokens)

        # we need this to validate the size of the token_type embeddings
        # though currently we are passing all zeros to the token_type embeddings
-        n_token_types = len(set(toktypes))
-        self.gguf_writer.add_token_type_count(n_token_types)
+        self.gguf_writer.add_token_type_count(2)  # "Sequence A" or "Sequence B"

        # convert to phantom space vocab
-        def phantom(tok, typ):
-            if tok.startswith(b"[") and tok.endswith(b"]"):
+        def phantom(tok):
+            if tok.startswith("[") and tok.endswith("]"):
                return tok
-            if tok.startswith(b"##"):
+            if tok.startswith("##"):
                return tok[2:]
-            return b"\xe2\x96\x81" + tok
-        tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes))
-
-        # set up bos and eos tokens (cls and sep)
-        self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id)
-        self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id)
+            return "\u2581" + tok
+        tokens = list(map(phantom, tokens))

        # add vocab to gguf
        self.gguf_writer.add_tokenizer_model("bert")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)

        # handle special tokens
@@ -2091,6 +2567,10 @@ class BertModel(Model):
                print(f"Can not map tensor {name!r}")
                sys.exit()

+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
            data = data_torch.squeeze().numpy()
            n_dims = len(data.shape)
            new_dtype: type[np.floating[Any]]
@@ -2142,16 +2622,6 @@ class NomicBertModel(BertModel):
        super().set_gguf_parameters()
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])

-    def get_tensors(self):
-        assert self.vocab_size is not None
-        for name, data in super().get_tensors():
-            # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly.
-            if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size:
-                rounded_vocab_size = (self.vocab_size + 63) // 64 * 64
-                assert data.shape == (rounded_vocab_size, self.hparams["n_embd"])
-                data = data[:self.vocab_size, :]
-            yield name, data
-

@Model.register("GemmaForCausalLM")
 class GemmaModel(Model):
@@ -2160,6 +2630,16 @@ class GemmaModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

+        # TODO: these special tokens should be exported only for the CodeGemma family
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
+        special_vocab._set_special_token("prefix", 67)
+        special_vocab._set_special_token("suffix", 69)
+        special_vocab._set_special_token("middle", 68)
+        special_vocab._set_special_token("fsep",   70)
+        special_vocab._set_special_token("eot",    107)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]
@@ -2181,6 +2661,12 @@ class GemmaModel(Model):
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
+            # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
+            # To prevent errors, skip loading lm_head.weight.
+            if name == "lm_head.weight":
+                print(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
+                continue
+
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
@@ -2240,28 +2726,37 @@ class MambaModel(Model):

            field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
            self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
+
+            field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
+            self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]))
+
            field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
            self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
+
            field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
            self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+
            field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
            self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
+
            field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
            self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
+
            field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
            self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
+
            field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
            self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])

    def set_gguf_parameters(self):
-        d_model = self.find_hparam(["hidden_size", "d_model"])
-        d_conv  = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+        d_model = self.find_hparam(["hidden_size",       "d_model"])
+        d_conv  = self.find_hparam(["conv_kernel",       "d_conv"],  optional=True) or 4
        d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
-        d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
+        d_state = self.find_hparam(["state_size",        "d_state"], optional=True) or 16
        # ceiling division
        # ref: https://stackoverflow.com/a/17511341/22827863
        # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
-        dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
+        dt_rank      = self.find_hparam(["time_step_rank",     "dt_rank"],      optional=True) or -(d_model // -16)
        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

        # Fail early for models which don't have a block expansion factor of 2
@@ -2327,7 +2822,8 @@ class MambaModel(Model):
                data = data.astype(np.float32)

            # if f16 desired, convert big float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
+            new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
+            if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
                data = data.astype(np.float16)

            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
@@ -2352,6 +2848,66 @@ class CommandR2Model(Model):
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)


+@Model.register("OlmoForCausalLM")
+@Model.register("OLMoForCausalLM")
+class OlmoModel(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_layer_norm_eps(1e-5)
+        if (clip_qkv := self.hparams.get("clip_qkv")) is not None:  # skip when the key is absent or set to null
+            self.gguf_writer.add_clamp_kqv(clip_qkv)
+
+    # Same as super class, but permuting q_proj, k_proj
+    # Copied from: LlamaModel
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_head = self.hparams.get("num_attention_heads")
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        for name, data_torch in self.get_tensors():
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.numpy()
+
+            if name.endswith("q_proj.weight"):
+                data = permute(data, n_head, n_head)
+            if name.endswith("k_proj.weight"):
+                data = permute(data, n_head, n_kv_head)
+
+            data = data.squeeze()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # 1d tensors need to be converted to float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+
 ###### CONVERSION LOGIC ######


@@ -2378,6 +2934,8 @@ def parse_args() -> argparse.Namespace:
        "model", type=Path,
        help="directory containing model file",
    )
+    parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
+    parser.add_argument("--model-name", type=str, default=None, help="name of the model")

    return parser.parse_args()

@@ -2421,7 +2979,7 @@ def main() -> None:

    with torch.inference_mode():
        model_class = Model.from_model_architecture(hparams["architectures"][0])
-        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
+        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)

        print("Set model parameters")
        model_instance.set_gguf_parameters()
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -281,6 +281,7 @@ class GGMLToGGUF:
    def add_vocab(self, gguf_writer):
        hp = self.model.hyperparameters
        gguf_writer.add_tokenizer_model('llama')
+        gguf_writer.add_tokenizer_pre('default')
        tokens = []
        scores = []
        toktypes = []
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+from __future__ import annotations
+
 import argparse
 import os
 import sys
@@ -97,6 +99,7 @@ def main():

    tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
    gguf_writer.add_tokenizer_model('llama')
+    gguf_writer.add_tokenizer_pre('default')
    gguf_writer.add_token_list(tokens)
    gguf_writer.add_token_scores(scores)
    gguf_writer.add_token_types(toktypes)
--- a/convert.py
+++ b/convert.py
@@ -33,7 +33,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
 import gguf

 if TYPE_CHECKING:
-    from typing import TypeAlias
+    from typing_extensions import Self, TypeAlias

 if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
    faulthandler.register(signal.SIGUSR1)
@@ -139,7 +139,8 @@ class GGMLFileType(enum.IntEnum):
        dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
        if dt is None:
            raise ValueError(self)
-        # 1D tensors are always F32.
+        # Convert all 1D tensors to F32.  Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the outputs tensors are F32.
+        #  Also The 1d tensors aren't much of a performance/size issue.  So instead of having to have separate F32 and F16 implementations of both, just convert everything to F32 for now.
        return dt if len(tensor.shape) > 1 else DT_F32


@@ -516,7 +517,7 @@ class LlamaHfVocab(Vocab):
    tokenizer_model = "llama"
    name = "hfft"

-    def __init__(self, base_path: Path, ignore_nonllama: bool = False):
+    def __init__(self, base_path: Path):
        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
        # if this fails, FileNotFoundError propagates to caller
        with open(fname_tokenizer, encoding='utf-8') as f:
@@ -524,9 +525,14 @@ class LlamaHfVocab(Vocab):

        # pre-check so we know if we need transformers
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        if ignore_nonllama:
-            pass  # workaround incorrect use of this class for WordPiece
-        elif (
+        is_llama3 = (
+            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+            and not tokenizer_model.get('byte_fallback', True)
+        )
+        if is_llama3:
+            raise TypeError('Llama 3 must be converted with BpeVocab')
+
+        if not is_llama3 and (
            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):
@@ -646,16 +652,17 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:


 class Tensor(ABC):
+    ndarray: NDArray
    data_type: DataType

    @abstractmethod
-    def astype(self, data_type: DataType) -> Tensor: ...
+    def astype(self, data_type: DataType) -> Self: ...
    @abstractmethod
-    def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
+    def permute(self, n_head: int, n_head_kv: int) -> Self: ...
    @abstractmethod
-    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
+    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
    @abstractmethod
-    def part(self, n_part: int) -> UnquantizedTensor: ...
+    def part(self, n_part: int) -> Self: ...
    @abstractmethod
    def to_ggml(self) -> GGMLCompatibleTensor: ...

@@ -672,13 +679,13 @@ class UnquantizedTensor(Tensor):
        self.ndarray = ndarray
        self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]

-    def astype(self, data_type: DataType) -> Tensor:
+    def astype(self, data_type: DataType) -> UnquantizedTensor:
        dtype = data_type.dtype
        if self.data_type == DT_BF16:
            self.ndarray = bf16_to_fp32(self.ndarray)
        return UnquantizedTensor(self.ndarray.astype(dtype))

-    def to_ggml(self) -> UnquantizedTensor:
+    def to_ggml(self) -> Self:
        return self

    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
@@ -1350,7 +1357,7 @@ def load_some_model(path: Path) -> ModelPlus:
    # Be extra-friendly and accept either a file or a directory:
    if path.is_dir():
        # Check if it's a set of safetensors files first
-        globs = ["model-00001-of-*.safetensors", "model.safetensors"]
+        globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
        files = [file for glob in globs for file in path.glob(glob)]
        if not files:
            # Try the PyTorch patterns too, with lower priority
--- a/docs/HOWTO-add-model.md
+++ b/docs/HOWTO-add-model.md
@@ -0,0 +1,119 @@
+## Add a new model architecture to `llama.cpp`
+
+Adding a model requires a few steps:
+
+1. Convert the model to GGUF
+2. Define the model architecture in `llama.cpp`
+3. Build the GGML graph implementation
+
+After following these steps, you can open a PR.
+
+Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially:
+- [main](../examples/main)
+- [imatrix](../examples/imatrix)
+- [quantize](../examples/quantize)
+- [server](../examples/server)
+
+### 1. Convert the model to GGUF
+
+This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
+Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
+
+The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
+
+The required steps to implement for an HF model are:
+
+1. Define the model `Model.register` annotation in a new `Model` subclass, example:
+
+```python
+@Model.register("MyModelForCausalLM")
+class MyModel(Model):
+    model_arch = gguf.MODEL_ARCH.GROK
+```
+
+2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py)
+
+Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`.
+
+Example for `falcon` model:
+```python
+    MODEL_ARCH.FALCON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ]
+```
+
+3. Map the original tensor names to the standardized equivalent in GGUF
+
+As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist.
+
+Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file.
+
+If the tensor name is part of a repetitive layer/block, the keyword `bid` substitutes for the layer/block index.
+
+Example for the normalization tensor in attention layers:
+
+```python
+block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
+        # Attention norm
+        MODEL_TENSOR.ATTN_NORM: (
+            "gpt_neox.layers.{bid}.input_layernorm",                # gptneox
+            "transformer.h.{bid}.ln_1",                             # gpt2 gpt-j refact qwen
+            "transformer.blocks.{bid}.norm_1",                      # mpt
+            ...
+        )
+}
+```
+
+`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF.
+
+Depending on the model configuration, tokenizer, code and tensors layout, you will have to override:
+- `Model#set_gguf_parameters`
+- `Model#set_vocab`
+- `Model#write_tensors`
+
+NOTE: Tensor names must end with the `.weight` suffix; that is the convention, and several tools like `quantize` expect this in order to process the weights.
+
+### 2. Define the model architecture in `llama.cpp`
+
+The model params and tensors layout must be defined in `llama.cpp`:
+1. Define a new `llm_arch`
+2. Define the tensors layout in `LLM_TENSOR_NAMES`
+3. Add any non-standard metadata in `llm_load_hparams`
+4. Create the tensors for inference in `llm_load_tensors`
+5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
+
+NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
+
+### 3. Build the GGML graph implementation
+
+This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
+
+Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
+
+When implementing a new graph, please note that the underlying `ggml` backends might not support all of its operations; support for missing backend operations can be added in another PR.
+
+Note: to debug the inference graph, you can use [eval-callback](../examples/eval-callback).
+
+## GGUF specification
+
+https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
+
+## Resources
+
+- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
+- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
+- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
+- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
+- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
+- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
+- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
+- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
+- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -19,6 +19,7 @@ else()
    add_subdirectory(benchmark)
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(embedding)
+    add_subdirectory(eval-callback)
    add_subdirectory(finetune)
    add_subdirectory(gritlm)
    add_subdirectory(gguf-split)
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -10,16 +10,16 @@ There are 2 modes of operation:
 - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

 ```bash
-./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
+./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>

 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
+./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 0 99

 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 0 1 99

 # custom set of batches
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 0 999 128,256,512 128,256 1,2,4,8,16,32
 ```

 ## Sample results
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -32,13 +32,16 @@ int main(int argc, char ** argv) {
    gpt_params params;

    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
        printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
-        printf("  example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+        printf("  example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
        return 1 ;
    }

    int n_kv_max     = 2048;
+    int n_batch      = 2048;
+    int n_ubatch     = 512;
+    bool flash_attn  = false;
    int is_pp_shared = 0;
    int n_gpu_layers = 0;

@@ -56,23 +59,35 @@ int main(int argc, char ** argv) {
    }

    if (argc >= 4) {
-        is_pp_shared = std::atoi(argv[3]);
+        n_batch = std::atoi(argv[3]);
    }

    if (argc >= 5) {
-        n_gpu_layers = std::atoi(argv[4]);
+        n_ubatch = std::atoi(argv[4]);
    }

    if (argc >= 6) {
-        n_pp = parse_list(argv[5]);
+        flash_attn = std::atoi(argv[5]);
    }

    if (argc >= 7) {
-        n_tg = parse_list(argv[6]);
+        is_pp_shared = std::atoi(argv[6]);
    }

    if (argc >= 8) {
-        n_pl = parse_list(argv[7]);
+        n_gpu_layers = std::atoi(argv[7]);
+    }
+
+    if (argc >= 9) {
+        n_pp = parse_list(argv[8]);
+    }
+
+    if (argc >= 10) {
+        n_tg = parse_list(argv[9]);
+    }
+
+    if (argc >= 11) {
+        n_pl = parse_list(argv[10]);
    }

    // init LLM
@@ -98,9 +113,11 @@ int main(int argc, char ** argv) {

    llama_context_params ctx_params = llama_context_default_params();

-    ctx_params.seed      = 1234;
-    ctx_params.n_ctx     = n_kv_max;
-    ctx_params.n_batch   = 512;
+    ctx_params.seed       = 1234;
+    ctx_params.n_ctx      = n_kv_max;
+    ctx_params.n_batch    = n_batch;
+    ctx_params.n_ubatch   = n_ubatch;
+    ctx_params.flash_attn = flash_attn;

    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -158,7 +175,7 @@ int main(int argc, char ** argv) {
    }

    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
    LOG_TEE("\n");

    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -153,7 +153,7 @@ while n_cur <= n_len {
        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

        // is it an end of stream? -> mark the stream as finished
-        if new_token_id == llama_token_eos(model) || n_cur == n_len {
+        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
            i_batch[i] = -1
            // print("")
            if n_parallel > 1 {
@@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {

 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
    var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
    if nTokens < 0 {
        let actualTokensCount = -Int(nTokens)
        result = .init(repeating: 0, count: actualTokensCount)
@@ -237,7 +237,8 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
            model,
            token,
            &result,
-            Int32(result.count)
+            Int32(result.count),
+            false
        )
        assert(check == actualTokensCount)
    } else {
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -191,8 +191,8 @@ int main(int argc, char ** argv) {

            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

-            // is it an end of stream? -> mark the stream as finished
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+            // is it an end of generation? -> mark the stream as finished
+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
                i_batch[i] = -1;
                LOG_TEE("\n");
                if (n_parallel > 1) {
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@@ -47,7 +47,7 @@ struct beam_search_callback_data {
 // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
 // For example, eob can be flagged due to maximum token length, stop words, etc.
 static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
-    return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
+    return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
 }

 // Function matching type llama_beam_search_callback_fn_t.
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -123,10 +123,10 @@ int main(int argc, char ** argv) {
        inputs.push_back(inp);
    }

-    // add eos if not present
+    // add SEP if not present
    for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_token_eos(model)) {
-            inp.push_back(llama_token_eos(model));
+        if (inp.empty() || inp.back() != llama_token_sep(model)) {
+            inp.push_back(llama_token_sep(model));
        }
    }

--- a/examples/eval-callback/CMakeLists.txt
+++ b/examples/eval-callback/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(TARGET eval-callback)
+add_executable(${TARGET} eval-callback.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TEST_TARGET test-eval-callback)
+add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
--- a/examples/eval-callback/README.md
+++ b/examples/eval-callback/README.md
@@ -0,0 +1,95 @@
+# llama.cpp/examples/eval-callback
+
+A simple example which demonstrates how to use a callback during inference.
+It simply prints to the console all operations and tensor data.
+
+Usage:
+
+```shell
+eval-callback \
+  --hf-repo ggml-org/models \
+  --hf-file phi-2/ggml-model-q4_0.gguf \
+  --model phi-2-q4_0.gguf \
+  --prompt hello \
+  --seed 42 \
+  -ngl 33
+```
+
+Will print:
+
+```shell
+llm_load_tensors: offloaded 33/33 layers to GPU
+...
+llama_new_context_with_model: n_ctx      = 512
+...
+llama_new_context_with_model:      CUDA0 compute buffer size =   105.00 MiB
+llama_new_context_with_model:  CUDA_Host compute buffer size =     6.01 MiB
+llama_new_context_with_model: graph nodes  = 1225
+llama_new_context_with_model: graph splits = 2
+ggml_debug:                 inp_embd = (f32)   GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -0.0181,   0.0272,   0.0272, ...],
+                                      ],
+                                     ]
+ggml_debug:                   norm-0 = (f32)       NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -0.6989,   1.0636,   1.0636, ...],
+                                      ],
+                                     ]
+ggml_debug:                 norm_w-0 = (f32)        MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -0.1800,   0.2817,   0.2632, ...],
+                                      ],
+                                     ]
+ggml_debug:              attn_norm-0 = (f32)        ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -0.1863,   0.2970,   0.2604, ...],
+                                      ],
+                                     ]
+ggml_debug:                   wqkv-0 = (f32)    MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1238,   1.2876,  -1.8086, ...],
+                                      ],
+                                     ]
+ggml_debug:                   bqkv-0 = (f32)        ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                      ],
+                                     ]
+ggml_debug:            bqkv-0 (view) = (f32)       VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                      ],
+                                     ]
+ggml_debug:                   Qcur-0 = (f32)       CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                      ],
+                                     ]
+ggml_debug:        Qcur-0 (reshaped) = (f32)    RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                       [ -0.3608,   0.5076,  -1.8866, ...],
+                                       [  1.7643,   0.0273,  -2.1065, ...],
+                                       ...
+                                      ],
+                                     ]
+ggml_debug:                   Qcur-0 = (f32)       ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                       [ -0.3608,   0.5076,  -1.8866, ...],
+                                       [  1.7643,   0.0273,  -2.1065, ...],
+                                       ...
+                                      ],
+                                     ]
+```
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -0,0 +1,195 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <cstdio>
+#include <random>
+#include <string>
+#include <tuple>
+#include <vector>
+
+/**
+ * This is the arbitrary data which will be passed to each callback.
+ * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
+ */
+struct callback_data {
+    std::vector<uint8_t> data; // scratch buffer that receives a tensor's bytes when its buffer is not host memory
+};
+
+static std::string ggml_ne_string(const ggml_tensor * t) {
+    std::string dims; // comma-separated extents of all GGML_MAX_DIMS dimensions, e.g. "2560, 1, 1, 1"
+    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
+        if (d > 0) {
+            dims += ", "; // separator goes in front of every extent except the first
+        }
+        dims += std::to_string(t->ne[d]);
+    }
+    return dims;
+}
+
+static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { // print a non-quantized tensor, eliding the middle of long dimensions
+    GGML_ASSERT(n > 0); // n = how many leading and trailing entries are shown per dimension
+    float sum = 0; // running total of the printed values only; it is not reset between i3 slices
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+        printf("                                     [\n");
+        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+            if (i2 == n && ne[2] > 2*n) { // after the first n entries, jump straight to the last n
+                printf("                                      ..., \n");
+                i2 = ne[2] - n;
+            }
+            printf("                                      [\n");
+            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                if (i1 == n && ne[1] > 2*n) {
+                    printf("                                       ..., \n");
+                    i1 = ne[1] - n;
+                }
+                printf("                                       [");
+                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                    if (i0 == n && ne[0] > 2*n) {
+                        printf("..., ");
+                        i0 = ne[0] - n;
+                    }
+                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; // byte offset of the element inside data, built from the strides in nb
+                    float v;
+                    if (type == GGML_TYPE_F16) {
+                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); // index first, then reinterpret: "*(T *) data + i" would read element 0 and add i to its value
+                    } else if (type == GGML_TYPE_F32) {
+                        v = *(float *) &data[i];
+                    } else if (type == GGML_TYPE_I32) {
+                        v = (float) *(int32_t *) &data[i];
+                    } else if (type == GGML_TYPE_I16) {
+                        v = (float) *(int16_t *) &data[i];
+                    } else if (type == GGML_TYPE_I8) {
+                        v = (float) *(int8_t *) &data[i];
+                    } else {
+                        GGML_ASSERT(false); // any other tensor type is not supported by this printer
+                    }
+                    printf("%12.4f", v);
+                    sum += v;
+                    if (i0 < ne[0] - 1) printf(", ");
+                }
+                printf("],\n");
+            }
+            printf("                                      ],\n");
+        }
+        printf("                                     ]\n");
+        printf("                                     sum = %f\n", sum);
+    }
+}
+
+/**
+ * GGML operations callback during the graph execution.
+ *
+ * @param t current tensor
+ * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
+ *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
+ *            see ggml_backend_sched_eval_callback
+ * @param user_data user data to pass at each call back
+ * @return true to receive data or continue the graph, false otherwise
+ */
+static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (callback_data *) user_data;
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    if (ask) {
+        return true; // Always retrieve data
+    }
+
+    // describe each operand as "name{ne0, ne1, ne2, ne3}"; a missing operand becomes an empty string.
+    // std::string is used so that a long tensor name cannot overflow a fixed-size buffer
+    const std::string src0_str = src0 ? std::string(src0->name) + "{" + ggml_ne_string(src0) + "}" : std::string();
+    const std::string src1_str = src1 ? std::string(src1->name) + "{" + ggml_ne_string(src1) + "}" : std::string();
+
+    printf("%s: %24s = (%s) %10s(%s, %s}) = {%s}\n", __func__,
+           t->name, ggml_type_name(t->type), ggml_op_desc(t),
+           src0_str.c_str(),
+           src1_str.c_str(),
+           ggml_ne_string(t).c_str());
+
+
+    // copy the data from the GPU memory if needed
+    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
+
+    if (!is_host) {
+        auto n_bytes = ggml_nbytes(t);
+        cb_data->data.resize(n_bytes);
+        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+    }
+
+    if (!ggml_is_quantized(t->type)) { // quantized tensors are only listed, their values are not printed
+        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+    }
+
+    return true;
+}
+
+static bool run(llama_context * ctx, const gpt_params & params) {
+    // tokenize the prompt; BOS is added only when llama_should_add_bos_token() says so
+    const bool with_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, with_bos);
+
+    // evaluate the whole prompt as a single batch; a non-zero status counts as failure
+    const auto status = llama_decode(ctx, llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size(), 0, 0));
+    if (status != 0) {
+        fprintf(stderr, "%s : failed to eval\n", __func__);
+    }
+    return status == 0;
+}
+
+int main(int argc, char ** argv) {
+
+    callback_data cb_data; // handed to the eval callback by pointer below, so it has to outlive the context
+
+    gpt_params params;
+    if (!gpt_params_parse(argc, argv, params)) {
+        return 1;
+    }
+
+    print_build_info();
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // pass the callback to the backend scheduler
+    // it will be executed for each node during the graph computation
+    params.cb_eval = ggml_debug;
+    params.cb_eval_user_data = &cb_data;
+    params.warmup = false;
+
+    // init
+    llama_model * model;
+    llama_context * ctx;
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == nullptr || ctx == nullptr) {
+        fprintf(stderr, "%s : failed to init\n", __func__);
+        return 1;
+    }
+
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
+    }
+
+    // evaluate the prompt; timings are printed only when the evaluation succeeded
+    const bool ok = run(ctx, params);
+    if (ok) {
+        llama_print_timings(ctx);
+    }
+
+    // release the context, the model and the backend on both the success and the failure path
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    return ok ? 0 : 1;
+}
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -17,7 +17,7 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
    size_t pos = 0;
    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
        auto prev_stacks = grammar->stacks;
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
        if (grammar->stacks.empty()) {
            error_pos = pos;
            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
--- a/examples/gguf-split/README.md
+++ b/examples/gguf-split/README.md
@@ -5,5 +5,6 @@ CLI to split / merge GGUF files.
 **Command line options:**

 - `--split`: split GGUF to multiple GGUF, default operation.
+- `--split-max-size`: max size per split in `M` or `G`, e.g. `500M` or `2G`.
 - `--split-max-tensors`: maximum tensors in each split: default(128)
 - `--merge`: merge multiple GGUF to a single GGUF.
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -59,10 +59,10 @@ static size_t split_str_to_n_bytes(std::string str) {
    int n;
    if (str.back() == 'M') {
        sscanf(str.c_str(), "%d", &n);
-        n_bytes = n * 1024 * 1024; // megabytes
+        n_bytes = (size_t)n * 1024 * 1024; // megabytes
    } else if (str.back() == 'G') {
        sscanf(str.c_str(), "%d", &n);
-        n_bytes = n * 1024 * 1024 * 1024; // gigabytes
+        n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
    } else {
        throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
    }
--- a/examples/gguf-split/tests.sh
+++ b/examples/gguf-split/tests.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+set -eu
+
+if [ $# -lt 1 ]
+then
+    echo "usage:   $0 path_to_build_binary [path_to_temp_folder]"
+    echo "example: $0 ../../build/bin ../../tmp"
+    exit 1
+fi
+
+if [ $# -gt 1 ]
+then
+    TMP_DIR=$2
+else
+    TMP_DIR=/tmp
+fi
+
+set -x
+
+SPLIT=$1/gguf-split
+MAIN=$1/main
+WORK_PATH=$TMP_DIR/gguf-split
+ROOT_DIR=$(realpath "$(dirname "$0")"/../../)
+
+mkdir -p "$WORK_PATH"
+
+# Clean up in case of previously failed test (paths are quoted, the globs stay outside the quotes)
+rm -f "$WORK_PATH"/ggml-model-split*.gguf "$WORK_PATH"/ggml-model-merge*.gguf
+
+# 1. Get a model
+(
+cd "$WORK_PATH"
+"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+)
+echo PASS
+
+# 2. Split with max tensors strategy
+"$SPLIT" --split-max-tensors 28  "$WORK_PATH"/gemma-1.1-2b-it.Q8_0.gguf "$WORK_PATH"/ggml-model-split
+echo PASS
+echo
+
+# 2b. Test the sharded model is loading properly
+"$MAIN" --model "$WORK_PATH"/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 3. Merge
+"$SPLIT" --merge "$WORK_PATH"/ggml-model-split-00001-of-00006.gguf "$WORK_PATH"/ggml-model-merge.gguf
+echo PASS
+echo
+
+# 3b. Test the merged model is loading properly
+"$MAIN" --model "$WORK_PATH"/ggml-model-merge.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 4. Split with no tensor in metadata
+#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
+#echo PASS
+#echo
+
+# 4b. Test the sharded model is loading properly
+#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32
+#echo PASS
+#echo
+
+# 5. Merge
+#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
+#echo PASS
+#echo
+
+# 5b. Test the merged model is loading properly
+#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
+#echo PASS
+#echo
+
+# 6. Split with size strategy
+"$SPLIT" --split-max-size 2G "$WORK_PATH"/ggml-model-merge.gguf "$WORK_PATH"/ggml-model-split-2G
+echo PASS
+echo
+
+# 6b. Test the sharded model is loading properly
+"$MAIN" --model "$WORK_PATH"/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# Clean up
+rm -f "$WORK_PATH"/ggml-model-split*.gguf "$WORK_PATH"/ggml-model-merge*.gguf
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -142,7 +142,7 @@ static bool gguf_ex_read_0(const std::string & fname) {
 }

 // read and create ggml_context containing the tensors and their data
-static bool gguf_ex_read_1(const std::string & fname) {
+static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
    struct ggml_context * ctx_data = NULL;

    struct gguf_init_params params = {
@@ -206,7 +206,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
            printf("\n\n");

            // check data
-            {
+            if (check_data) {
                const float * data = (const float *) cur->data;
                for (int j = 0; j < ggml_nelements(cur); ++j) {
                    if (data[j] != 100 + i) {
@@ -229,9 +229,16 @@ static bool gguf_ex_read_1(const std::string & fname) {

 int main(int argc, char ** argv) {
    if (argc < 3) {
-        printf("usage: %s data.gguf r|w\n", argv[0]);
+        printf("usage: %s data.gguf r|w [n]\n", argv[0]);
+        printf("r: read data.gguf file\n");
+        printf("w: write data.gguf file\n");
+        printf("n: no check of tensor data\n");
        return -1;
    }
+    bool check_data = true;
+    if (argc == 4) {
+        check_data = false;
+    }

    const std::string fname(argv[1]);
    const std::string mode (argv[2]);
@@ -242,7 +249,7 @@ int main(int argc, char ** argv) {
        GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
    } else if (mode == "r") {
        GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
-        GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
+        GGML_ASSERT(gguf_ex_read_1(fname, check_data) && "failed to read gguf file");
    }

    return 0;
--- a/examples/gritlm/README.md
+++ b/examples/gritlm/README.md
@@ -21,12 +21,12 @@ not have to be performed at all.
 ### Running the example
 Download a Grit model:
 ```console
-$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
+$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
 ```

 Run the example using the downloaded model:
 ```console
-$ ./gritlm -m gritlm-7b_q4_1.gguf
+$ ./gritlm -m models/gritlm-7b_q4_1.gguf

 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -23,6 +23,7 @@ struct Stats {
 };

 struct StatParams {
+    std::string dataset;
    std::string ofile = "imatrix.dat";
    int         n_output_frequency = 10;
    int         verbosity = 1;
@@ -44,9 +45,9 @@ private:
    std::mutex                             m_mutex;
    int                                    m_last_call = 0;
    std::vector<float>                     m_src1_data;
-    std::vector<int>                       m_ids; // the expert ids from ggml_mul_mat_id
+    std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
                                                  //
-    void save_imatrix(const char * file_name) const;
+    void save_imatrix(const char * file_name, const char * dataset) const;
    void keep_imatrix(int ncall) const;
 };

@@ -81,6 +82,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
    if (ask) {
        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
        if (t->op != GGML_OP_MUL_MAT) return false;
+        // why are small batches ignored (<16 tokens)?
        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
        if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
        return true;
@@ -101,16 +103,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
    // this has been adapted to the new format of storing merged experts in a single 3d tensor
    // ref: https://github.com/ggerganov/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
-        const int idx  = ((int32_t *) t->op_params)[0];
+        //   ids  -> [n_expert_used, n_tokens]
+        //   src1 -> [cols, n_expert_used, n_tokens]
        const ggml_tensor * ids = t->src[2];
        const int n_as = src0->ne[2];
+        const int n_ids = ids->ne[0];

        // the top-k selected expert ids are stored in the ids tensor
        // for simplicity, always copy ids to host, because it is small
        // take into account that ids is not contiguous!
-        GGML_ASSERT(ids->ne[1] == src1->ne[1]);
-        GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int)));
-        m_ids.resize(ggml_nbytes(ids)/sizeof(int));
+
+        GGML_ASSERT(ids->ne[1] == src1->ne[2]);
+
+        m_ids.resize(ggml_nbytes(ids));
        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));

        auto & e = m_stats[wname];
@@ -120,26 +125,35 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        //       using the following line, we can correct for that if needed by replacing the line above with:
        //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;

+        if (e.values.empty()) {
+            e.values.resize(src1->ne[0]*n_as, 0);
+        }
+        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
+            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+            exit(1); //GGML_ASSERT(false);
+        }
+        if (m_params.verbosity > 1) {
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
+        }
        // loop over all possible experts, regardless if they are used or not in the batch
        for (int ex = 0; ex < n_as; ++ex) {
            size_t e_start = ex*src1->ne[0];
-            if (e.values.empty()) {
-                e.values.resize(src1->ne[0]*n_as, 0);
-            }
-            else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
-                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
-                exit(1); //GGML_ASSERT(false);
-            }
-            if (m_params.verbosity > 1) {
-                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
-            }
-            for (int row = 0; row < (int)src1->ne[1]; ++row) {
-                const int excur = m_ids[row*n_as + idx];
-                GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
-                if (excur != ex) continue;
-                const float * x = data + row * src1->ne[0];
-                for (int j = 0; j < (int)src1->ne[0]; ++j) {
-                    e.values[e_start + j] += x[j]*x[j];
+
+            for (int idx = 0; idx < n_ids; ++idx) {
+                for (int row = 0; row < (int)src1->ne[2]; ++row) {
+                    const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);
+
+                    GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
+
+                    if (excur != ex) continue;
+
+                    const int64_t i11 = idx % src1->ne[1];
+                    const int64_t i12 = row;
+                    const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
+
+                    for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                        e.values[e_start + j] += x[j]*x[j];
+                    }
                }
            }
            if (e.ncall > m_last_call) {
@@ -186,7 +200,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 }

 void IMatrixCollector::save_imatrix() const {
-    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
+    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
 }

 void IMatrixCollector::keep_imatrix(int ncall) const {
@@ -194,24 +208,33 @@ void IMatrixCollector::keep_imatrix(int ncall) const {
    if (file_name.empty()) file_name = "imatrix.dat";
    file_name += ".at_";
    file_name += std::to_string(ncall);
-    save_imatrix(file_name.c_str());
+    save_imatrix(file_name.c_str(), m_params.dataset.c_str());
 }

-void IMatrixCollector::save_imatrix(const char * fname) const {
+void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
    std::ofstream out(fname, std::ios::binary);
    int n_entries = m_stats.size();
-    out.write((const char*)&n_entries, sizeof(n_entries));
-    for (auto& p : m_stats) {
+    out.write((const char *) &n_entries, sizeof(n_entries));
+    for (const auto & p : m_stats) {
        int len = p.first.size();
-        out.write((const char*)&len, sizeof(len));
+        out.write((const char *) &len, sizeof(len));
        out.write(p.first.c_str(), len);
-        out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
+        out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
        int nval = p.second.values.size();
-        out.write((const char*)&nval, sizeof(nval));
-        if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
+        out.write((const char *) &nval, sizeof(nval));
+        if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
    }
+
+    // Write the number of calls the matrix was computed with
+    out.write((const char *) &m_last_call, sizeof(m_last_call));
+
+    // Write the dataset name at the end of the file so that it can later be specified in quantize
+    int n_dataset = strlen(dataset);
+    out.write((const char *) &n_dataset, sizeof(n_dataset));
+    out.write(dataset, n_dataset);
+
    if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
    }
 }

@@ -349,12 +372,13 @@ static void process_logits(
 static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {

    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
    const int n_ctx = llama_n_ctx(ctx);

    auto tim1 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    auto tim2 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -533,6 +557,29 @@ int main(int argc, char ** argv) {
        }
    }

+    gpt_params params;
+    params.n_batch = 512;
+    if (!gpt_params_parse(args.size(), args.data(), params)) {
+        return 1;
+    }
+
+    params.logits_all = true;
+    params.n_batch = std::min(params.n_batch, params.n_ctx);
+
+    print_build_info();
+
+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    sparams.dataset = params.prompt_file;
    g_collector.set_parameters(std::move(sparams));

    if (!combine_files.empty()) {
@@ -571,49 +618,21 @@ int main(int argc, char ** argv) {
        }
    }

-    gpt_params params;
-    params.n_batch = 512;
-    if (!gpt_params_parse(args.size(), args.data(), params)) {
-        return 1;
-    }
-
-    params.logits_all = true;
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
-
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model_params mparams = llama_model_params_from_gpt_params(params);
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
-        return 1;
-    }
-
-    llama_context_params cparams = llama_context_params_from_gpt_params(params);
-
    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
-    cparams.cb_eval = ik_collect_imatrix;
-    cparams.cb_eval_user_data = NULL;
+    params.cb_eval = ik_collect_imatrix;
+    params.cb_eval_user_data = NULL;
+    params.warmup = false;

-    llama_context * ctx = llama_new_context_with_model(model, cparams);
-    if (ctx == NULL) {
-        fprintf(stderr, "%s: error: unable to create context\n", __func__);
+    // init
+    llama_model * model;
+    llama_context * ctx;
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == nullptr || ctx == nullptr) {
+        fprintf(stderr, "%s : failed to init\n", __func__);
        return 1;
    }

--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@@ -36,6 +36,11 @@ The `infill` program offers a seamless way to interact with LLaMA models, allowi

 ### Example

+Download a model that supports infill, for example CodeLlama:
+```console
+scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
+```
+
 ```bash
 ./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n    print(\"hell" --in-suffix "\n   print(\"goodbye world\")\n    "
 ```
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -239,6 +239,7 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s\n", get_system_info(params).c_str());
    }
    const bool add_bos = llama_should_add_bos_token(model);
+    GGML_ASSERT(llama_add_eos_token(model) != 1);
    LOG("add_bos: %d\n", add_bos);

    bool suff_rm_leading_spc = params.escape;
@@ -279,10 +280,10 @@ int main(int argc, char ** argv) {
    if (ctx_guidance) {
        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());

-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());

        original_prompt_len = original_inp.size();
@@ -585,7 +586,7 @@ int main(int argc, char ** argv) {

            // deal with eot token in infill mode
            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
-                if(is_interacting && !params.interactive_first) {
+                if (is_interacting && !params.interactive_first) {
                    // print an eot token
                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
                }
@@ -650,8 +651,8 @@ int main(int argc, char ** argv) {
                // LOG_TEE("took new input\n");
                is_interacting = false;
            }
-            // deal with end of text token in interactive mode
-            else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+            // deal with end of generation tokens in interactive mode
+            else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
                LOG("found EOS token\n");

                if (params.interactive) {
@@ -730,8 +731,8 @@ int main(int argc, char ** argv) {
            }
        }

-        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
+        // end of generation
+        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
            break;
        }

--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -6,37 +6,94 @@ import re
 import sys
 from typing import Any, Dict, List, Set, Tuple, Union

+def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
+    if not separator_rule:
+        if min_items == 0 and max_items == 1:
+            return f'{item_rule}?'
+        elif min_items == 1 and max_items is None:
+            return f'{item_rule}+'
+
+    result = ''
+
+    if min_items > 0:
+        if item_rule_is_literal and separator_rule is None:
+            result = '"' + (item_rule[1:-1] * min_items) + '"'
+        else:
+            result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
+
+    def opt_repetitions(up_to_n, prefix_with_sep=False):
+        '''
+            - n=4, no sep:             '(a (a (a (a)?)?)?)?'
+            - n=4, sep=',', prefix:    '("," a ("," a ("," a ("," a)?)?)?)?'
+            - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
+        '''
+
+        content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
+        if up_to_n == 0:
+            return ''
+        elif up_to_n == 1:
+            return f'({content})?'
+        elif separator_rule and not prefix_with_sep:
+            return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
+        else:
+            return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
+
+    if min_items > 0 and max_items != min_items:
+        result += ' '
+
+    if max_items is not None:
+        result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
+    else:
+        item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
+
+        if min_items == 0 and separator_rule:
+            result = f'({item_rule} {item_operator}*)?'
+        else:
+            result += f'{item_operator}*'
+
+    return result
+
+
+class BuiltinRule:
+    def __init__(self, content: str, deps: list = None):
+        self.content = content
+        self.deps = deps or []
+
+_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
+
 # whitespace is constrained to a single space char to prevent model "running away" in
 # whitespace. Also maybe improves generation quality?
 SPACE_RULE = '" "?'

 PRIMITIVE_RULES = {
-    'boolean': '("true" | "false") space',
-    'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
-    'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
-    'value'  : 'object | array | string | number | boolean',
-    'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
-    'array'  : '"[" space ( value ("," space value)* )? "]" space',
-    'uuid'   : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space',
-    'string': r''' "\"" (
-        [^"\\] |
-        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
-      )* "\"" space''',
-    'null': '"null" space',
+    'boolean'      : BuiltinRule('("true" | "false") space', []),
+    'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
+    'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
+    'number'       : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
+    'integer'      : BuiltinRule('("-"? integral-part) space', ['integral-part']),
+    'value'        : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
+    'object'       : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
+    'array'        : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
+    'uuid'         : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
+    'char'         : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
+    'string'       : BuiltinRule(r'"\"" char* "\"" space', ['char']),
+    'null'         : BuiltinRule('"null" space', []),
 }
-OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value']

 # TODO: support "uri", "email" string formats
-DATE_RULES = {
-    'date'   : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
-    'time'   : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
-    'date-time': 'date "T" time',
-    'date-string': '"\\"" date "\\"" space',
-    'time-string': '"\\"" time "\\"" space',
-    'date-time-string': '"\\"" date-time "\\"" space',
+STRING_FORMAT_RULES = {
+    'date'            : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
+    'time'            : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
+    'date-time'       : BuiltinRule('date "T" time', ['date', 'time']),
+    'date-string'     : BuiltinRule('"\\"" date "\\"" space', ['date']),
+    'time-string'     : BuiltinRule('"\\"" time "\\"" space', ['time']),
+    'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
 }

-RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()])
+DOTALL = '[\\U00000000-\\U0010FFFF]'
+DOT = '[^\\x0A\\x0D]'
+
+RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])

 INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
 GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
@@ -46,8 +103,6 @@ GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']'
 NON_LITERAL_SET = set('|.()[]{}*+?')
 ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')

-DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
-TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits

 class SchemaConverter:
    def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
@@ -55,7 +110,9 @@ class SchemaConverter:
        self._allow_fetch = allow_fetch
        self._dotall = dotall
        self._raw_pattern = raw_pattern
-        self._rules = {'space': SPACE_RULE}
+        self._rules = {
+            'space': SPACE_RULE,
+        }
        self._refs = {}
        self._refs_being_resolved = set()

@@ -65,6 +122,29 @@ class SchemaConverter:
        )
        return f'"{escaped}"'

+    def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str:
+        '''
+            not_literal('a') -> '([^a])'
+            not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)'
+        '''
+        assert len(literal) > 0, 'Empty literal not supported'
+        def recurse(i: int):
+            c = literal[i]
+            if maybe_escaped_underscores and c == '_':
+                yield f'[^{c}\\\\]'
+                yield ' | '
+                yield f'"\\\\"? "{c}"'
+            else:
+                yield f'[^{c}]'
+            if i < len(literal) - 1:
+                yield ' | '
+                yield self._format_literal(c)
+                yield ' ('
+                yield from recurse(i + 1)
+                yield ')?'
+
+        return ''.join(('(', *recurse(0), ')'))
+
    def _add_rule(self, name, rule):
        esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
        if esc_name not in self._rules or self._rules[esc_name] == rule:
@@ -169,10 +249,10 @@ class SchemaConverter:

            def get_dot():
                if self._dotall:
-                    rule = '[\\U00000000-\\U0010FFFF]'
+                    rule = DOTALL
                else:
                    # Accept any character... except \n and \r line break chars (\x0A and \xOD)
-                    rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]'
+                    rule = DOT
                return self._add_rule(f'dot', rule)

            def join_seq():
@@ -246,26 +326,14 @@ class SchemaConverter:

                    (sub, sub_is_literal) = seq[-1]

-                    if min_times == 0 and max_times is None:
-                        seq[-1] = (f'{sub}*', False)
-                    elif min_times == 0 and max_times == 1:
-                        seq[-1] = (f'{sub}?', False)
-                    elif min_times == 1 and max_times is None:
-                        seq[-1] = (f'{sub}+', False)
-                    else:
-                        if not sub_is_literal:
-                            id = sub_rule_ids.get(sub)
-                            if id is None:
-                                id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
-                                sub_rule_ids[sub] = id
-                            sub = id
+                    if not sub_is_literal:
+                        id = sub_rule_ids.get(sub)
+                        if id is None:
+                            id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
+                            sub_rule_ids[sub] = id
+                        sub = id

-                        seq[-1] = (
-                            ' '.join(
-                                ([f'"{sub[1:-1] * min_times}"'] if sub_is_literal else [sub] * min_times) +
-                                ([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])),
-                            False
-                        )
+                    seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
                else:
                    literal = ''
                    while i < length:
@@ -373,49 +441,47 @@ class SchemaConverter:
                    ' "]" space')
            else:
                item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
-                list_item_operator = f'( "," space {item_rule_name} )'
-                successive_items = ""
                min_items = schema.get("minItems", 0)
                max_items = schema.get("maxItems")
-                if min_items > 0:
-                    successive_items = list_item_operator * (min_items - 1)
-                    min_items -= 1
-                if max_items is not None and max_items > min_items:
-                    successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
-                else:
-                    successive_items += list_item_operator + "*"
-                if min_items == 0:
-                    rule = f'"[" space ( {item_rule_name} {successive_items} )? "]" space'
-                else:
-                    rule = f'"[" space {item_rule_name} {successive_items} "]" space'
-                return self._add_rule(rule_name, rule)
+                return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')

        elif schema_type in (None, 'string') and 'pattern' in schema:
            return self._visit_pattern(schema['pattern'], rule_name)

        elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
-            return self._add_rule(
+            return self._add_primitive(
                'root' if rule_name == 'root' else schema_format,
                PRIMITIVE_RULES['uuid']
            )

-        elif schema_type in (None, 'string') and schema_format in DATE_RULES:
-            for t, r in DATE_RULES.items():
-                self._add_rule(t, r)
-            return schema_format + '-string'
+        elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES:
+            prim_name = f'{schema_format}-string'
+            return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]))
+
+        elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema):
+            char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
+            min_len = schema.get('minLength', 0)
+            max_len = schema.get('maxLength')
+
+            return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')

        elif (schema_type == 'object') or (len(schema) == 0):
-            for n in OBJECT_RULE_NAMES:
-                self._add_rule(n, PRIMITIVE_RULES[n])
-            return self._add_rule(rule_name, 'object')
+            return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))

        else:
            assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
            # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
-            return self._add_rule(
-                'root' if rule_name == 'root' else schema_type,
-                PRIMITIVE_RULES[schema_type]
-            )
+            return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type])
+
+    def _add_primitive(self, name: str, rule: BuiltinRule):
+        n = self._add_rule(name, rule.content)
+
+        for dep in rule.deps:
+            dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep)
+            assert dep_rule, f'Rule {dep} not known'
+            if dep not in self._rules:
+                self._add_primitive(dep, dep_rule)
+        return n

    def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
        prop_order = self._prop_order
@@ -437,7 +503,7 @@ class SchemaConverter:
            value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
            prop_kv_rule_names["*"] = self._add_rule(
                f'{sub_name}-kv',
-                self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
+                self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
            )
            optional_props.append("*")

--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -174,6 +174,7 @@ struct cmd_params {
    std::vector<llama_split_mode> split_mode;
    std::vector<int> main_gpu;
    std::vector<bool> no_kv_offload;
+    std::vector<bool> flash_attn;
    std::vector<std::vector<float>> tensor_split;
    std::vector<bool> use_mmap;
    std::vector<bool> embeddings;
@@ -190,11 +191,12 @@ static const cmd_params cmd_params_defaults = {
    /* n_ubatch      */ {512},
    /* type_k        */ {GGML_TYPE_F16},
    /* type_v        */ {GGML_TYPE_F16},
-    /* n_threads     */ {get_num_physical_cores()},
+    /* n_threads     */ {get_math_cpu_count()},
    /* n_gpu_layers  */ {99},
    /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
    /* main_gpu      */ {0},
    /* no_kv_offload */ {false},
+    /* flash_attn    */ {false},
    /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
    /* use_mmap      */ {true},
    /* embeddings    */ {false},
@@ -220,6 +222,7 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
    printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
    printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
+    printf("  -fa, --flash-attn <0|1>             (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
    printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
    printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
    printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
@@ -393,6 +396,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<bool>(argv[i], split_delim);
            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
+        } else if (arg == "-fa" || arg == "--flash-attn") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<bool>(argv[i], split_delim);
+            params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
        } else if (arg == "-mmp" || arg == "--mmap") {
            if (++i >= argc) {
                invalid_param = true;
@@ -477,6 +487,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
+    if (params.flash_attn.empty())   { params.flash_attn = cmd_params_defaults.flash_attn; }
    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
    if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
    if (params.embeddings.empty())   { params.embeddings = cmd_params_defaults.embeddings; }
@@ -498,6 +509,7 @@ struct cmd_params_instance {
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
+    bool flash_attn;
    std::vector<float> tensor_split;
    bool use_mmap;
    bool embeddings;
@@ -532,6 +544,7 @@ struct cmd_params_instance {
        cparams.type_k = type_k;
        cparams.type_v = type_v;
        cparams.offload_kqv = !no_kv_offload;
+        cparams.flash_attn = flash_attn;
        cparams.embeddings = embeddings;

        return cparams;
@@ -554,6 +567,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & tk : params.type_k)
    for (const auto & tv : params.type_v)
    for (const auto & nkvo : params.no_kv_offload)
+    for (const auto & fa : params.flash_attn)
    for (const auto & nt : params.n_threads) {
        for (const auto & n_prompt : params.n_prompt) {
            if (n_prompt == 0) {
@@ -572,6 +586,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
+                /* .flash_attn   = */ fa,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
                /* .embeddings   = */ embd,
@@ -596,6 +611,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
                /* .no_kv_offload= */ nkvo,
+                /* .flash_attn   = */ fa,
                /* .tensor_split = */ ts,
                /* .use_mmap     = */ mmp,
                /* .embeddings   = */ embd,
@@ -633,6 +649,7 @@ struct test {
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
+    bool flash_attn;
    std::vector<float> tensor_split;
    bool use_mmap;
    bool embeddings;
@@ -657,6 +674,7 @@ struct test {
        split_mode = inst.split_mode;
        main_gpu = inst.main_gpu;
        no_kv_offload = inst.no_kv_offload;
+        flash_attn = inst.flash_attn;
        tensor_split = inst.tensor_split;
        use_mmap = inst.use_mmap;
        embeddings = inst.embeddings;
@@ -731,7 +749,7 @@ struct test {
            "n_batch", "n_ubatch",
            "n_threads", "type_k", "type_v",
            "n_gpu_layers", "split_mode",
-            "main_gpu", "no_kv_offload",
+            "main_gpu", "no_kv_offload", "flash_attn",
            "tensor_split", "use_mmap", "embeddings",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
@@ -753,7 +771,7 @@ struct test {
        }
        if (field == "cuda" || field == "opencl"  || field == "vulkan" || field == "kompute" || field == "metal" ||
            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-            field == "use_mmap" || field == "embeddings") {
+            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@@ -787,7 +805,7 @@ struct test {
            std::to_string(n_batch), std::to_string(n_ubatch),
            std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
            std::to_string(n_gpu_layers), split_mode_str(split_mode),
-            std::to_string(main_gpu), std::to_string(no_kv_offload),
+            std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
            tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -955,6 +973,9 @@ struct markdown_printer : public printer {
        if (field == "no_kv_offload") {
            return "nkvo";
        }
+        if (field == "flash_attn") {
+            return "fa";
+        }
        if (field == "use_mmap") {
            return "mmap";
        }
@@ -1001,6 +1022,9 @@ struct markdown_printer : public printer {
        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
            fields.emplace_back("no_kv_offload");
        }
+        if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
+            fields.emplace_back("flash_attn");
+        }
        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
            fields.emplace_back("tensor_split");
        }
--- a/examples/llama.android/app/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp
@@ -408,7 +408,7 @@ Java_com_example_llama_Llm_completion_1loop(
    const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);

    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
-    if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+    if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
        return env->NewStringUTF("");
    }

--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -158,7 +158,7 @@ actor LlamaContext {
            new_token_id = llama_sample_token_greedy(context, &candidates_p)
        }

-        if new_token_id == llama_token_eos(model) || n_cur == n_len {
+        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
            print("\n")
            let new_token_str = String(cString: temporary_invalid_cchars + [0])
            temporary_invalid_cchars.removeAll()
@@ -322,7 +322,7 @@ actor LlamaContext {
        defer {
            result.deallocate()
        }
-        let nTokens = llama_token_to_piece(model, token, result, 8)
+        let nTokens = llama_token_to_piece(model, token, result, 8, false)

        if nTokens < 0 {
            let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@@ -330,7 +330,7 @@ actor LlamaContext {
            defer {
                newResult.deallocate()
            }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
            return Array(bufferPointer)
        } else {
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -22,7 +22,7 @@ After building, run: `./llava-cli` to see the usage. For example:

 ## Model conversion

- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
+1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:

 ```sh
 git clone https://huggingface.co/mtgv/MobileVLM-1.7B
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -24,7 +24,7 @@ After building, run: `./llava-cli` to see the usage. For example:

 ## LLaVA 1.5

- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
+1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:

 ```sh
 git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
+#include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -23,7 +24,6 @@
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
-#include <iostream>
 #include <map>
 #include <regex>
 #include <stdexcept>
@@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_POS_EMBD        "%s.position_embd.weight"
 #define TN_CLASS_EMBD      "v.class_embd"
 #define TN_PATCH_EMBD      "v.patch_embd.weight"
+#define TN_PATCH_BIAS      "v.patch_embd.bias"
 #define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
@@ -145,7 +146,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
 static int get_key_idx(const gguf_context * ctx, const char * key) {
    int i = gguf_find_key(ctx, key);
    if (i == -1) {
-        fprintf(stderr, "key %s not found in file\n", key);
+        LOG_TEE("key %s not found in file\n", key);
        throw std::runtime_error(format("Missing required key: %s", key));
    }

@@ -247,7 +248,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {

 static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
    size_t tensor_size = ggml_nbytes(tensor);
-    printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+    LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
            prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
 }
@@ -265,7 +266,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
 static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
-        std::cerr << "Failed to open file for writing: " << filename << std::endl;
+        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
        return;
    }

@@ -284,7 +285,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
 static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
-        std::cerr << "Failed to open file for writing: " << filename << std::endl;
+        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
        return;
    }

@@ -425,6 +426,7 @@ struct clip_vision_model {
    // embeddings
    struct ggml_tensor * class_embedding;
    struct ggml_tensor * patch_embeddings;
+    struct ggml_tensor * patch_bias;
    struct ggml_tensor * position_embeddings;

    struct ggml_tensor * pre_ln_w;
@@ -501,6 +503,11 @@ struct clip_ctx {
    bool use_gelu = false;
    int32_t ftype = 1;

+    bool has_class_embedding = true;
+    bool has_pre_norm = true;
+    bool has_post_norm = false;
+    bool has_patch_bias = false;
+
    struct gguf_context * ctx_gguf;
    struct ggml_context * ctx_data;

@@ -515,7 +522,7 @@ struct clip_ctx {

 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
    if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return nullptr;
    }

@@ -526,7 +533,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    const int patch_size           = hparams.patch_size;
    const int num_patches          = ((image_size / patch_size) * (image_size / patch_size));
    const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
-    const int num_positions        = num_patches + 1;
+    const int num_positions        = num_patches + (ctx->has_class_embedding ? 1 : 0);
    const int hidden_size          = hparams.hidden_size;
    const int n_head               = hparams.n_head;
    const int d_head               = hidden_size / n_head;
@@ -557,16 +564,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

+    if (ctx->has_patch_bias) {
+        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
+        inp = ggml_add(ctx0, inp, model.patch_bias);
+    }
+
    // concat class_embeddings and patch_embeddings
-    struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+    struct ggml_tensor * embeddings = inp;
+    if (ctx->has_class_embedding) {
+        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+        embeddings = ggml_acc(ctx0, embeddings, inp,
+                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+    }
    ggml_set_name(embeddings, "embeddings");
    ggml_set_input(embeddings);

-    embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-
-    embeddings = ggml_acc(ctx0, embeddings, inp,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);

    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
    ggml_set_name(positions, "positions");
@@ -576,7 +590,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

    // pre-layernorm
-    {
+    if (ctx->has_pre_norm) {
        embeddings = ggml_norm(ctx0, embeddings, eps);
        ggml_set_name(embeddings, "pre_ln");

@@ -664,6 +678,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        embeddings = cur;
    }

+    // post-layernorm
+    if (ctx->has_post_norm) {
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "post_ln");
+
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
+    }
+
    // llava projector
    {
        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -879,21 +901,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        const int idx_name = gguf_find_key(ctx, KEY_NAME);
        if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
            const std::string name = gguf_get_val_str(ctx, idx_name);
-            printf("%s: model name:   %s\n", __func__, name.c_str());
+            LOG_TEE("%s: model name:   %s\n", __func__, name.c_str());
        }
-        printf("%s: description:  %s\n", __func__, description.c_str());
-        printf("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
-        printf("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
-        printf("%s: n_tensors:    %d\n", __func__, n_tensors);
-        printf("%s: n_kv:         %d\n", __func__, n_kv);
-        printf("%s: ftype:        %s\n", __func__, ftype_str.c_str());
-        printf("\n");
+        LOG_TEE("%s: description:  %s\n", __func__, description.c_str());
+        LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
+        LOG_TEE("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
+        LOG_TEE("%s: n_tensors:    %d\n", __func__, n_tensors);
+        LOG_TEE("%s: n_kv:         %d\n", __func__, n_kv);
+        LOG_TEE("%s: ftype:        %s\n", __func__, ftype_str.c_str());
+        LOG_TEE("\n");
    }
    const int n_tensors = gguf_get_n_tensors(ctx);

    // kv
    const int n_kv = gguf_get_n_kv(ctx);
-    printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+    LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
        __func__, n_kv, n_tensors, fname);
    {
        std::map<enum ggml_type, uint32_t> n_type;
@@ -904,7 +926,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            n_type[type]++;
        }

-        printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
        for (int i = 0; i < n_kv; i++) {
            const char * name           = gguf_get_key(ctx, i);
            const enum gguf_type type   = gguf_get_kv_type(ctx, i);
@@ -920,7 +942,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            }
            replace_all(value, "\n", "\\n");

-            printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+            LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
        }

        // print type counts
@@ -929,7 +951,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                continue;
            }

-            printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+            LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
        }
    }

@@ -944,7 +966,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            size_t tensor_size = ggml_nbytes(cur);
            model_size += tensor_size;
            if (verbosity >= 3) {
-                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
                       __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
            }
        }
@@ -971,18 +993,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

 #ifdef GGML_USE_CUDA
    new_clip->backend = ggml_backend_cuda_init(0);
-    printf("%s: CLIP using CUDA backend\n", __func__);
+    LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
 #endif

 #ifdef GGML_USE_METAL
    new_clip->backend = ggml_backend_metal_init();
-    printf("%s: CLIP using Metal backend\n", __func__);
+    LOG_TEE("%s: CLIP using Metal backend\n", __func__);
 #endif


    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
-        printf("%s: CLIP using CPU backend\n", __func__);
+        LOG_TEE("%s: CLIP using CPU backend\n", __func__);
    }

    // model size and capabilities
@@ -1006,15 +1028,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        new_clip->use_gelu = gguf_get_val_bool(ctx, idx);

        if (verbosity >= 1) {
-            printf("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
-            printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
-            printf("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
-            printf("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
-            printf("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+            LOG_TEE("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
+            LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
+            LOG_TEE("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
+            LOG_TEE("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
+            LOG_TEE("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
        }
    }

-    printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
+    LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);

    // load tensors
    {
@@ -1027,7 +1049,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

        new_clip->ctx_data = ggml_init(params);
        if (!new_clip->ctx_data) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+            LOG_TEE("%s: ggml_init() failed\n", __func__);
            clip_free(new_clip);
            gguf_free(ctx);
            return nullptr;
@@ -1035,7 +1057,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

        auto fin = std::ifstream(fname, std::ios::binary);
        if (!fin) {
-            printf("cannot open model file for loading tensors\n");
+            LOG_TEE("cannot open model file for loading tensors\n");
            clip_free(new_clip);
            gguf_free(ctx);
            return nullptr;
@@ -1057,7 +1079,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
            fin.seekg(offset, std::ios::beg);
            if (!fin) {
-                printf("%s: failed to seek for tensor %s\n", __func__, name);
+                LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
                clip_free(new_clip);
                gguf_free(ctx);
                return nullptr;
@@ -1128,34 +1150,61 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }

        if (verbosity >= 2) {
-            printf("\n%s: vision model hparams\n", __func__);
-            printf("image_size         %d\n", hparams.image_size);
-            printf("patch_size         %d\n", hparams.patch_size);
-            printf("v_hidden_size      %d\n", hparams.hidden_size);
-            printf("v_n_intermediate   %d\n", hparams.n_intermediate);
-            printf("v_projection_dim   %d\n", hparams.projection_dim);
-            printf("v_n_head           %d\n", hparams.n_head);
-            printf("v_n_layer          %d\n", hparams.n_layer);
-            printf("v_eps              %f\n", hparams.eps);
-            printf("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
-            printf("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
-            printf("v_image_grid_pinpoints: ");
+            LOG_TEE("\n%s: vision model hparams\n", __func__);
+            LOG_TEE("image_size         %d\n", hparams.image_size);
+            LOG_TEE("patch_size         %d\n", hparams.patch_size);
+            LOG_TEE("v_hidden_size      %d\n", hparams.hidden_size);
+            LOG_TEE("v_n_intermediate   %d\n", hparams.n_intermediate);
+            LOG_TEE("v_projection_dim   %d\n", hparams.projection_dim);
+            LOG_TEE("v_n_head           %d\n", hparams.n_head);
+            LOG_TEE("v_n_layer          %d\n", hparams.n_layer);
+            LOG_TEE("v_eps              %f\n", hparams.eps);
+            LOG_TEE("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+            LOG_TEE("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+            LOG_TEE("v_image_grid_pinpoints: ");
            for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
-                printf("%d ", hparams.image_grid_pinpoints[i]);
+                LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
            }
-            printf("\n");
-            printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+            LOG_TEE("\n");
+            LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);

        }

+        try {
+            vision_model.class_embedding  = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
+            new_clip->has_class_embedding = true;
+        } catch (const std::exception& e) {
+            new_clip->has_class_embedding = false;
+        }
+
+        try {
+            vision_model.pre_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+            vision_model.pre_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
+            new_clip->has_pre_norm = true;
+        } catch (std::exception & e) {
+            new_clip->has_pre_norm = false;
+        }
+
+        try {
+            vision_model.post_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
+            vision_model.post_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
+            new_clip->has_post_norm = true;
+        } catch (std::exception & e) {
+            new_clip->has_post_norm = false;
+        }
+
+        try {
+            vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
+            new_clip->has_patch_bias = true;
+        } catch (std::exception & e) {
+            new_clip->has_patch_bias = false;
+        }
+
        try {
            vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
-            vision_model.class_embedding     = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
-            vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-            vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
        } catch(const std::exception& e) {
-            fprintf(stderr, "%s: failed to load vision model tensors\n", __func__);
+            LOG_TEE("%s: failed to load vision model tensors\n", __func__);
        }

        // LLaVA projection
@@ -1184,7 +1233,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            } catch (std::runtime_error & e) {  }
            try {
                vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
-                // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__);
+                // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
            } catch (std::runtime_error & e) {  }
        } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projection
@@ -1264,7 +1313,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+        LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
    }

    return new_clip;
@@ -1304,7 +1353,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
    int nx, ny, nc;
    auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
    if (!data) {
-        fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
+        LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
        return false;
    }
    build_clip_img_from_data(data, nx, ny, img);
@@ -1316,7 +1365,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
    int nx, ny, nc;
    auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
    if (!data) {
-        fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
+        LOG_TEE("%s: failed to decode image bytes\n", __func__);
        return false;
    }
    build_clip_img_from_data(data, nx, ny, img);
@@ -1325,7 +1374,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
 }

 // Linear interpolation between two points
-inline float lerp(float s, float e, float t) {
+inline float clip_lerp(float s, float e, float t) {
    return s + (e - s) * t;
 }
 // Bilinear resize function
@@ -1347,17 +1396,17 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
            float y_lerp = py - y_floor;

            for (int c = 0; c < 3; c++) {
-                float top = lerp(
+                float top = clip_lerp(
                    static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
                    static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
                    x_lerp
                );
-                float bottom = lerp(
+                float bottom = clip_lerp(
                    static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
                    static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
                    x_lerp
                );
-                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
+                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp));
            }
        }
    }
@@ -1506,7 +1555,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
        int downscaled_height = static_cast<int>(original_height * scale);
        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        int wasted_resolution = (width * height) - effective_resolution;
-        // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution;
@@ -1545,7 +1594,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
    bool pad_to_square = true;
    if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return false;
    }
    auto & params = ctx->vision_model.hparams;
@@ -1622,7 +1671,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
            }

            for (size_t i = 0; i < patches.size(); i++) {
-                // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+                // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
                clip_image_u8_free(patches[i]);
            }

@@ -1765,7 +1814,7 @@ int clip_n_patches(const struct clip_ctx * ctx) {

 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
    if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return false;
    }

@@ -1777,7 +1826,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3

 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
    if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return false;
    }

@@ -1939,7 +1988,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            new_type = type;
            if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
                new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
-                // fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
+                // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
            }
            const size_t n_elms = ggml_nelements(cur);
            float * f32_data;
@@ -1958,7 +2007,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                f32_data = (float *)conv_buf.data();
                break;
            default:
-                printf("Please use an input file in f32 or f16\n");
+                LOG_TEE("Please use an input file in f32 or f16\n");
                gguf_free(ctx_out);
                return false;
            }
@@ -1985,7 +2034,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            fout.put(0);
        }

-        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
+        LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
               orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
    }

@@ -2001,8 +2050,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
    gguf_free(ctx_out);

    {
-        printf("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+        LOG_TEE("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+        LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
    }

    return true;
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "log.h"
 #include "common.h"
 #include "clip.h"
 #include "llava.h"
@@ -18,7 +19,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-            fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
        *n_past += n_eval;
@@ -45,7 +46,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
    static std::string ret;
-    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
+    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
    } else {
        ret = llama_token_to_piece(ctx_llama, id);
@@ -73,7 +74,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
    size_t img_base64_str_start, img_base64_str_end;
    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-        fprintf(stderr, "%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+        LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
        return NULL;
    }

@@ -87,7 +88,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip

    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
    if (!embed) {
-        fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
+        LOG_TEE("%s: could not load image from base64 string.\n", __func__);
        return NULL;
    }

@@ -112,29 +113,29 @@ struct llava_context {
 };

 static void show_additional_info(int /*argc*/, char ** argv) {
-    fprintf(stderr, "\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    fprintf(stderr, "  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

-static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
+static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {

    // load and preprocess the image
    llava_image_embed * embed = NULL;
    auto prompt = params->prompt;
    if (prompt_contains_image(prompt)) {
        if (!params->image.empty()) {
-            fprintf(stderr, "using base64 encoded image instead of command line image path\n");
+            LOG_TEE("using base64 encoded image instead of command line image path\n");
        }
        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
        if (!embed) {
-            fprintf(stderr, "%s: can't load image from prompt\n", __func__);
+            LOG_TEE("%s: can't load image from prompt\n", __func__);
            return NULL;
        }
        params->prompt = remove_image_from_prompt(prompt);
    } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
        if (!embed) {
-            fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
+            LOG_TEE("%s: is %s really an image file?\n", __func__, fname.c_str());
            return NULL;
        }
    }
@@ -146,7 +147,6 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
    int n_past = 0;

    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));

    std::string system_prompt, user_prompt;
    size_t image_pos = prompt.find("<image>");
@@ -154,18 +154,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
        system_prompt = prompt.substr(0, image_pos);
        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        printf("system_prompt: %s\n", system_prompt.c_str());
+        LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
-        printf("user_prompt: %s\n", user_prompt.c_str());
+        LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    } else {
@@ -175,18 +175,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    }

-    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);

    // generate the response

-    fprintf(stderr, "\n");
+    LOG_TEE("\n");

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
    std::string response = "";
@@ -207,8 +207,21 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
    printf("\n");
 }

+static struct llama_model * llava_init(gpt_params * params) {
+    llama_backend_init();
+    llama_numa_init(params->numa);

-static struct llava_context * llava_init(gpt_params * params) {
+    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+
+    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    if (model == NULL) {
+        LOG_TEE("%s: error: unable to load model\n" , __func__);
+        return NULL;
+    }
+    return model;
+}
+
+static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
    const char * clip_path = params->mmproj.c_str();

    auto prompt = params->prompt;
@@ -218,16 +231,6 @@ static struct llava_context * llava_init(gpt_params * params) {

    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);

-    llama_backend_init();
-    llama_numa_init(params->numa);
-
-    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
-
-    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
-    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
-        return NULL;
-    }

    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
    ctx_params.n_ctx           = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@@ -235,7 +238,7 @@ static struct llava_context * llava_init(gpt_params * params) {
    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

    if (ctx_llama == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
        return NULL;
    }

@@ -258,6 +261,12 @@ static void llava_free(struct llava_context * ctx_llava) {
    llama_backend_free();
 }

+static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    LOG_TEE("%s", text);
+}
+
 int main(int argc, char ** argv) {
    ggml_time_init();

@@ -267,29 +276,54 @@ int main(int argc, char ** argv) {
        show_additional_info(argc, argv);
        return 1;
    }
+
+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("llava", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+    llama_log_set(llama_log_callback_logTee, nullptr);
+#endif // LOG_DISABLE_LOGS
+
    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
        gpt_print_usage(argc, argv, params);
        show_additional_info(argc, argv);
        return 1;
    }
-
-    auto ctx_llava = llava_init(&params);
-    if (ctx_llava == NULL) {
-        fprintf(stderr, "%s: error: failed to init llava\n", __func__);
+    auto model = llava_init(&params);
+    if (model == NULL) {
+        LOG_TEE("%s: error: failed to init llava model\n", __func__);
        return 1;
    }

-    auto image_embed = load_image(ctx_llava, &params);
-    if (!image_embed) {
-        return 1;
+    // A base64 image embedded in the prompt needs one pass even when no --image was given
+    // (the usage check above allows that case); load_image ignores the file name then.
+    auto images = params.image;
+    if (images.empty()) {
+        images.emplace_back();
+    }
+    for (auto & image : images) {
+        auto ctx_llava = llava_init_context(&params, model);
+        if (ctx_llava == NULL) {
+            LOG_TEE("%s: error: failed to init llava context\n", __func__);
+            llama_free_model(model);
+            return 1;
+        }
+
+        auto image_embed = load_image(ctx_llava, &params, image);
+        if (!image_embed) {
+            std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
+            return 1;
+        }
+
+        // process the prompt
+        process_prompt(ctx_llava, image_embed, &params, params.prompt);
+
+        llama_print_timings(ctx_llava->ctx_llama);
+        llava_image_embed_free(image_embed);
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
    }
+    llama_free_model(model);

-    // process the prompt
-    process_prompt(ctx_llava, image_embed, &params, params.prompt);
-
-    llama_print_timings(ctx_llava->ctx_llama);
-
-    llava_image_embed_free(image_embed);
-    llava_free(ctx_llava);
    return 0;
 }
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -54,7 +54,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
        int downscaled_height = static_cast<int>(original_height * scale);
        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        int wasted_resolution = (width * height) - effective_resolution;
-        // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution;
@@ -154,13 +154,13 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
    model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
    if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
        if (newline_tmp->buffer == NULL) {
-            printf("newline_tmp tensor buffer is NULL\n");
+            LOG_TEE("newline_tmp tensor buffer is NULL\n");
        }
        ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
    } else {
        model.newline->data = newline_tmp->data;
        if (model.newline->data == NULL) {
-            printf("newline_tmp tensor data is NULL\n");
+            LOG_TEE("newline_tmp tensor data is NULL\n");
        }
    }

@@ -224,7 +224,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    img_res_v.size = 0;
    img_res_v.data = nullptr;
    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
-        fprintf(stderr, "%s: unable to preprocess image\n", __func__);
+        LOG_TEE("%s: unable to preprocess image\n", __func__);
        delete[] img_res_v.data;
        return false;
    }
@@ -239,7 +239,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
        delete[] img_res_v.data;
        if (!encoded) {
-            fprintf(stderr, "Unable to encode image\n");
+            LOG_TEE("Unable to encode image\n");

            return false;
        }
@@ -252,12 +252,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
            const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
            if (!encoded) {
-                fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                return false;
            }
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
-        printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        const int32_t * image_grid = clip_image_grid(ctx_clip);

@@ -290,12 +290,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
    }

-    printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

    const int64_t t_img_enc_end_us = ggml_time_us();
    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

-    printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);

    return true;
 }
@@ -305,7 +305,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
    auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
    if (n_image_embd != n_llama_embd) {
-        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+        LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
        return false;
    }
    return true;
@@ -314,13 +314,13 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
    if (!image_embd) {
-        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
+        LOG_TEE("Unable to allocate memory for image embeddings\n");
        return false;
    }

    int n_img_pos;
    if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
-        fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
+        LOG_TEE("%s: cannot encode image, aborting\n", __func__);
        free(image_embd);
        return false;
    }
@@ -340,7 +340,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
        }
        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
        if (llama_decode(ctx_llama, batch)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
+            LOG_TEE("%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
@@ -352,7 +352,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
    clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
        clip_image_u8_free(img);
-        fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
+        LOG_TEE("%s: can't load image from bytes, is it a valid image?\n", __func__);
        return NULL;
    }

@@ -361,7 +361,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
    bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
    if (!image_embed_result) {
        clip_image_u8_free(img);
-        fprintf(stderr, "%s: coulnd't embed the image\n", __func__);
+        LOG_TEE("%s: couldn't embed the image\n", __func__);
        return NULL;
    }

@@ -375,7 +375,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
 static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
    auto file = fopen(path, "rb");
    if (file == NULL) {
-        fprintf(stderr, "%s: can't read file %s\n", __func__, path);
+        LOG_TEE("%s: can't read file %s\n", __func__, path);
        return false;
    }

@@ -385,7 +385,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long

    auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
    if (buffer == NULL) {
-        fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
        perror("Memory allocation error");
        fclose(file);
        return false;
@@ -410,7 +410,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
    long image_bytes_length;
    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
    if (!loaded) {
-        fprintf(stderr, "%s: failed to load %s\n", __func__, image_path);
+        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
        return NULL;
    }

--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -64,13 +64,10 @@ int main(int argc, char ** argv) {
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    // Tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
-
    std::vector<llama_token> inp;
    std::vector<llama_token> all;

-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);
    all = inp;

    const int max_context_size     = llama_n_ctx(ctx);
@@ -302,7 +299,7 @@ int main(int argc, char ** argv) {
                }
                fflush(stdout);

-                if (id == llama_token_eos(model)) {
+                if (llama_token_is_eog(model, id)) {
                    has_eos = true;
                }

--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -28,10 +28,8 @@ int main(int argc, char ** argv){
    GGML_ASSERT(model != nullptr);

    // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-
    std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);
    fprintf(stderr, "%s: tokenization done\n", __func__);


--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -30,15 +30,11 @@ int main(int argc, char ** argv){

    // load the model
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    llama_set_rng_seed(ctx, params.seed);
    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

    // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
-
    std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);

    llama_ngram_cache ngram_cache_context;
    llama_ngram_cache ngram_cache_dynamic;
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -38,15 +38,11 @@ int main(int argc, char ** argv){

    // load the model
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    llama_set_rng_seed(ctx, params.seed);
    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

    // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
-
    std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx, params.prompt, true, true);

    llama_ngram_cache ngram_cache_context;
    llama_ngram_cache ngram_cache_dynamic;
@@ -144,7 +140,7 @@ int main(int argc, char ** argv){
                printf("%s", token_str.c_str());
            }

-            if (id == llama_token_eos(model)) {
+            if (llama_token_is_eog(model, id)) {
                has_eos = true;
            }

--- a/examples/main-cmake-pkg/README.md
+++ b/examples/main-cmake-pkg/README.md
@@ -17,11 +17,9 @@ In this case, CLBlast was already installed so the CMake package is referenced i
 ```cmd
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
-mkdir build
-cd build
-cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
-cmake --build . --config Release
-cmake --install . --prefix C:/LlamaCPP
+cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
+cmake --build build --config Release
+cmake --install build --prefix C:/LlamaCPP
 ```

 ### Build main-cmake-pkg
@@ -29,9 +27,7 @@ cmake --install . --prefix C:/LlamaCPP

 ```cmd
 cd ..\examples\main-cmake-pkg
-mkdir build
-cd build
-cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
-cmake --build . --config Release
-cmake --install . --prefix C:/MyLlamaApp
+cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
+cmake --build build --config Release
+cmake --install build --prefix C:/MyLlamaApp
 ```
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -66,7 +66,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt

 In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:

-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set).
 -   `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
@@ -304,13 +304,15 @@ These options help improve the performance and memory usage of the LLaMA models.

 -   `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.

-### Grammars
+### Grammars & JSON schemas

 -   `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.

+-   `--json-schema SCHEMA`: Specify a [JSON schema](https://json-schema.org/) to constrain model output to (e.g. `{}` for any JSON object, or `{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}` for a JSON array of strings with size constraints). If a schema uses external `$ref`s, you should use `--grammar "$( python examples/json_schema_to_grammar.py myschema.json )"` instead.
+
 ### Quantization

-For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
+For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).

 ## Additional Options

--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -235,17 +235,17 @@ int main(int argc, char ** argv) {
            // The file exists and is not empty
            session_tokens.resize(n_ctx);
            size_t n_token_count_out = 0;
-            if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
+            if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
                LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
                return 1;
            }
            session_tokens.resize(n_token_count_out);
-            llama_set_rng_seed(ctx, params.seed);
            LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
        }
    }

    const bool add_bos = llama_should_add_bos_token(model);
+    GGML_ASSERT(llama_add_eos_token(model) != 1);
    LOG("add_bos: %d\n", add_bos);

    std::vector<llama_token> embd_inp;
@@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
        if (params.chatml) {
            params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
        }
-        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
    } else {
        LOG("use session tokens\n");
        embd_inp = session_tokens;
@@ -277,10 +277,10 @@ int main(int argc, char ** argv) {
    if (ctx_guidance) {
        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());

-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());

        original_prompt_len = original_inp.size();
@@ -324,7 +324,7 @@ int main(int argc, char ** argv) {
            log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());

    // if we will use the cache for the full prompt without reaching the end of the cache, force
-    // reevaluation of the last token token to recalculate the cached logits
+    // reevaluation of the last token to recalculate the cached logits
    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
        LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);

@@ -339,14 +339,14 @@ int main(int argc, char ** argv) {
    }

    // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false,   true);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true,  true);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false, true);

    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());

    // chatml prefix & suffix
-    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
+    const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
    const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);

    LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
@@ -544,7 +544,7 @@ int main(int argc, char ** argv) {
                // if we run out of context:
                // - take the n_keep first tokens from the original prompt (via n_past)
                // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-                if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
+                if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
                    if (params.n_predict == -2) {
                        LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                        break;
@@ -693,7 +693,7 @@ int main(int argc, char ** argv) {
            // optionally save the session on first sample (for faster prompt loading next time)
            if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
                need_to_save_session = false;
-                llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+                llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());

                LOG("saved session to %s\n", path_session.c_str());
            }
@@ -794,8 +794,8 @@ int main(int argc, char ** argv) {
                }
            }

-            // deal with end of text token in interactive mode
-            if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+            // deal with end of generation tokens in interactive mode
+            if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
                LOG("found EOS token\n");

                if (params.interactive) {
@@ -919,8 +919,8 @@ int main(int argc, char ** argv) {
            }
        }

-        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
+        // end of generation
+        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) {
            LOG_TEE(" [end of text]\n");
            break;
        }
@@ -935,7 +935,7 @@ int main(int argc, char ** argv) {

    if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
        LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
-        llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+        llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
    }

    llama_print_timings(ctx);
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -359,7 +359,7 @@ int main(int argc, char ** argv) {
                //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());

                if (client.n_decoded > 2 &&
-                        (id == llama_token_eos(model) ||
+                        (llama_token_is_eog(model, id) ||
                         (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
                         client.response.find("User:") != std::string::npos ||
                         client.response.find('\n') != std::string::npos)) {
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -252,8 +252,8 @@ int main(int argc, char ** argv) {
            // sample the most likely token
            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

-            // is it an end of stream?
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
                LOG_TEE("\n");

                break;
--- a/examples/perplexity/README.md
+++ b/examples/perplexity/README.md
@@ -1,21 +1,132 @@
-# perplexity
+# Perplexity

-TODO
+The `perplexity` example can be used to calculate the so-called perplexity value of a language model over a given text corpus.
+Perplexity measures how well the model can predict the next token with lower values being better.
+Note that perplexity is **not** directly comparable between models, especially if they use different tokenizers.
+Also note that finetunes typically result in a higher perplexity value even though the human-rated quality of outputs increases.

-## Llama 2 70B Scorechart
-Quantization | Model size (GiB) | Perplexity | Delta to fp16
-- | -- | -- | --
-Q4_0 | 36.20 | 3.5550 | 3.61%
-Q4_1 | 40.20 | 3.5125 | 2.37%
-Q5_0 | 44.20 | 3.4744 | 1.26%
-Q2_K | 27.27 | 3.7339 | 8.82%
-Q3_K_S | 27.86 | 3.7019 | 7.89%
-Q3_K_M | 30.83 | 3.5932 | 4.72%
-Q3_K_L | 33.67 | 3.5617 | 3.80%
-Q4_K_S | 36.39 | 3.4852 | 1.57%
-Q4_K_M | 38.54 | 3.4725 | 1.20%
-Q5_K_S | 44.20 | 3.4483 | 0.50%
-Q5_K_M | 45.41 | 3.4451 | 0.40%
-Q6_K | 52.70 | 3.4367 | 0.16%
-fp16 | 128.5 | 3.4313 | -
+Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16.
+The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`).

+By default only the mean perplexity value and the corresponding uncertainty are calculated.
+The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits per token and then applying error propagation.
+
+More statistics can be obtained by recording the logits from the FP16 version of a model.
+To do this, supply `perplexity` with `--kl-divergence-base path/to/logit/binary/file.kld`.
+The program will then record all logits and save them to the provided path in binary format.
+**The logit file will be very large, 11 GiB for LLaMA 2 or 37 GiB for LLaMA 3 when using the Wikitext-2 test set.**
+Once you have the file, supply `perplexity` with the quantized model, the logits file via `--kl-divergence-base`,
+and finally the `--kl-divergence` argument to indicate that the program should calculate the so-called Kullback-Leibler divergence.
+This is a measure of how similar the FP16 and the quantized logit distributions are, with a value of 0 indicating that the distributions are the same.
+The uncertainty on the mean KL divergence is calculated by assuming the KL divergence per token follows a Gaussian distribution.
+
+In addition to the KL divergence the following statistics are calculated with `--kl-divergence`:
+
+* Ratio of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated. The logarithm of this metric is also calculated and printed; it is 0 if the logit distributions are the same.
+* Difference of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated.
+* Mean change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse.
+* Pearson correlation coefficient of the "correct" token probabilities between models.
+* Percentiles of change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse. Can be used to judge noise vs. quality loss from quantization. If the percentiles are symmetric then the quantization is essentially just adding noise. If the negative values are significantly larger than the positive values then this indicates that the model is actually becoming worse from the quantization.
+* The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated assuming that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggerganov/llama.cpp/discussions/2875 .
+* Same top p: Percentage of how often the token was assigned the highest probability by both models. The uncertainty is calculated from the Gaussian approximation of the binomial distribution.
+
+## LLaMA 3 8b Scoreboard
+
+Results are sorted by Kullback-Leibler divergence relative to FP16.
+The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat).
+
+| Quantization | imatrix | Model size [GiB] | PPL                    | ΔPPL                   | KLD                   | Mean Δp           | RMS Δp           |
+|--------------|---------|------------------|------------------------|------------------------|-----------------------|-------------------|------------------|
+| f16          | None    |            14.97 | 6.233160 ±   0.037828  | -                      | -                     | -                 | -                |
+| q8_0         | None    |             7.96 | 6.234284 ±   0.037878  | 0.002650 ±   0.001006  | 0.001355 ±   0.000006 | -0.019 ± 0.003 %  | 1.198 ± 0.007 %  |
+| q6_K         | None    |             6.14 | 6.253382 ±   0.038078  | 0.021748 ±   0.001852  | 0.005452 ±   0.000035 | -0.007 ± 0.006 %  | 2.295 ± 0.019 %  |
+| q5_K_M       | None    |             5.33 | 6.288607 ±   0.038338  | 0.056974 ±   0.002598  | 0.010762 ±   0.000079 | -0.114 ± 0.008 %  | 3.160 ± 0.031 %  |
+| q5_K_S       | None    |             5.21 | 6.336598 ±   0.038755  | 0.104964 ±   0.003331  | 0.016595 ±   0.000122 | -0.223 ± 0.010 %  | 3.918 ± 0.036 %  |
+| q5_1         | None    |             5.65 | 6.337857 ±   0.038677  | 0.106223 ±   0.003476  | 0.018045 ±   0.000139 | -0.287 ± 0.011 %  | 4.123 ± 0.039 %  |
+| q5_0         | None    |             5.21 | 6.363224 ±   0.038861  | 0.131591 ±   0.003894  | 0.022239 ±   0.000166 | -0.416 ± 0.012 %  | 4.634 ± 0.043 %  |
+| q4_K_M       | WT 10m  |             4.58 | 6.382937 ±   0.039055  | 0.151303 ±   0.004429  | 0.028152 ±   0.000240 | -0.389 ± 0.014 %  | 5.251 ± 0.049 %  |
+| q4_K_M       | None    |             4.58 | 6.407115 ±   0.039119  | 0.175482 ±   0.004620  | 0.031273 ±   0.000238 | -0.596 ± 0.014 %  | 5.519 ± 0.050 %  |
+| q4_K_S       | WT 10m  |             4.37 | 6.409697 ±   0.039189  | 0.178064 ±   0.004744  | 0.031951 ±   0.000259 | -0.531 ± 0.015 %  | 5.645 ± 0.051 %  |
+| iq4_NL       | WT 10m  |             4.35 | 6.455593 ±   0.039630  | 0.223959 ±   0.005201  | 0.035742 ±   0.000288 | -0.590 ± 0.016 %  | 5.998 ± 0.054 %  |
+| iq4_XS       | WT 10m  |             4.14 | 6.459705 ±   0.039595  | 0.228071 ±   0.005207  | 0.036334 ±   0.000284 | -0.668 ± 0.016 %  | 6.044 ± 0.054 %  |
+| q4_K_S       | None    |             4.37 | 6.500529 ±   0.039778  | 0.268895 ±   0.005638  | 0.043136 ±   0.000314 | -0.927 ± 0.017 %  | 6.562 ± 0.055 %  |
+| q4_1         | None    |             4.78 | 6.682737 ±   0.041285  | 0.451103 ±   0.008030  | 0.071683 ±   0.000505 | -0.927 ± 0.017 %  | 8.512 ± 0.063 %  |
+| q4_0         | None    |             4.34 | 6.700147 ±   0.041226  | 0.468514 ±   0.007951  | 0.071940 ±   0.000491 | -1.588 ± 0.022 %  | 8.434 ± 0.061 %  |
+| q3_K_L       | WT 10m  |             4.03 | 6.671223 ±   0.041427  | 0.439590 ±   0.008154  | 0.073077 ±   0.000529 | -0.940 ± 0.023 %  | 8.662 ± 0.064 %  |
+| q3_K_M       | WT 10m  |             3.74 | 6.734255 ±   0.041838  | 0.502622 ±   0.008901  | 0.084358 ±   0.000588 | -1.198 ± 0.024 %  | 9.292 ± 0.065 %  |
+| q3_K_L       | None    |             4.03 | 6.787876 ±   0.042104  | 0.556242 ±   0.009171  | 0.087176 ±   0.000614 | -1.532 ± 0.025 %  | 9.432 ± 0.067 %  |
+| q3_K_M       | None    |             3.74 | 6.888498 ±   0.042669  | 0.656864 ±   0.010071  | 0.101913 ±   0.000677 | -1.990 ± 0.026 %  | 10.203 ± 0.068 % |
+| iq3_M        | WT 10m  |             3.53 | 6.898327 ±   0.041643  | 0.666694 ±   0.009449  | 0.102534 ±   0.000663 | -3.178 ± 0.026 %  | 10.513 ± 0.066 % |
+| iq3_S        | WT 10m  |             3.42 | 6.965501 ±   0.042406  | 0.733867 ±   0.010245  | 0.111278 ±   0.000710 | -3.066 ± 0.027 %  | 10.845 ± 0.068 % |
+| iq3_XS       | WT 10m  |             3.28 | 7.163043 ±   0.043772  | 0.931409 ±   0.012084  | 0.138693 ±   0.000857 | -3.667 ± 0.031 %  | 12.148 ± 0.070 % |
+| iq3_XXS      | WT 10m  |             3.05 | 7.458436 ±   0.046404  | 1.226803 ±   0.015234  | 0.183625 ±   0.001042 | -3.918 ± 0.035 %  | 13.836 ± 0.074 % |
+| q3_K_S       | WT 10m  |             3.41 | 7.602878 ±   0.046848  | 1.371244 ±   0.015688  | 0.199821 ±   0.001008 | -5.046 ± 0.037 %  | 14.980 ± 0.070 % |
+| q3_K_S       | None    |             3.41 | 7.863786 ±   0.048885  | 1.632152 ±   0.017733  | 0.228217 ±   0.001079 | -5.604 ± 0.038 %  | 15.541 ± 0.070 % |
+| iq2_M        | WT 10m  |             2.74 | 8.600799 ±   0.055124  | 2.369166 ±   0.025244  | 0.325989 ±   0.00160  | -6.463 ± 0.046 %  | 18.519 ± 0.080 % |
+| q2_K         | WT 10k  |             2.96 | 8.652290 ±   0.055572  | 2.420657 ±   0.025587  | 0.331393 ±   0.001562 | -6.606 ± 0.046 %  | 18.790 ± 0.078 % |
+| q2_K         | WT 100k |             2.96 | 8.641993 ±   0.055406  | 2.410359 ±   0.025495  | 0.331672 ±   0.001569 | -6.628 ± 0.047 %  | 18.856 ± 0.078 % |
+| q2_K         | WT 10m  |             2.96 | 8.647825 ±   0.055610  | 2.416191 ±   0.025683  | 0.332223 ±   0.001572 | -6.500 ± 0.047 %  | 18.881 ± 0.078 % |
+| q2_K         | WT 1m   |             2.96 | 8.674365 ±   0.055743  | 2.442732 ±   0.025843  | 0.335308 ±   0.001576 | -6.634 ± 0.047 %  | 19.009 ± 0.079 % |
+| q2_K         | WT 1k   |             2.96 | 8.682605 ±   0.055916  | 2.450972 ±   0.026069  | 0.337093 ±   0.001596 | -6.596 ± 0.047 %  | 18.977 ± 0.079 % |
+| q2_K_S       | WT 10m  |             2.96 | 9.323778 ±   0.061551  | 3.092145 ±   0.031914  | 0.403360 ±   0.001787 | -7.131 ± 0.049 %  | 20.050 ± 0.081 % |
+| q2_K_S       | WT 1m   |             2.96 | 9.329321 ±   0.061378  | 3.097688 ±   0.031816  | 0.403590 ±   0.001797 | -7.289 ± 0.049 %  | 20.123 ± 0.081 % |
+| q2_K_S       | WT 100k |             2.96 | 9.362973 ±   0.061740  | 3.131339 ±   0.032169  | 0.408367 ±   0.001802 | -7.198 ± 0.050 %  | 20.132 ± 0.081 % |
+| q2_K_S       | WT 10k  |             2.96 | 9.376479 ±   0.062045  | 3.144846 ±   0.032464  | 0.408662 ±   0.001819 | -7.141 ± 0.050 %  | 20.120 ± 0.081 % |
+| q2_K_S       | WT 1k   |             2.96 | 9.415200 ±   0.062475  | 3.183567 ±   0.032993  | 0.415865 ±   0.001846 | -7.153 ± 0.050 %  | 20.311 ± 0.082 % |
+| iq2_S        | WT 10m  |             2.56 | 9.650781 ±   0.063209  | 3.419148 ±   0.034017  | 0.439197 ±   0.001976 | -8.319 ± 0.052 %  | 21.491 ± 0.083 % |
+| q2_K         | None    |             2.96 | 9.751568 ±   0.063312  | 3.519934 ±   0.033863  | 0.445132 ±   0.001835 | -9.123 ± 0.051 %  | 21.421 ± 0.079 % |
+| iq2_XS       | WT 10m  |             2.43 | 10.761424 ±   0.071056 | 4.529791 ±   0.042229  | 0.546290 ±   0.002133 | -10.576 ± 0.056 % | 23.872 ± 0.082 % |
+| iq2_XXS      | WT 10m  |             2.24 | 14.091782 ±   0.098396 | 7.860148 ±   0.070752  | 0.812022 ±   0.002741 | -14.363 ± 0.065 % | 28.576 ± 0.084 % |
+| iq1_M        | WT 10m  |             2.01 | 25.493722 ±   0.177903 | 19.262089 ±   0.152396 | 1.393084 ±   0.003529 | -24.672 ± 0.077 % | 38.287 ± 0.084 % |
+| iq1_S        | WT 1m   |             1.88 | 58.097760 ±   0.438604 | 51.866126 ±   0.416604 | 2.211278 ±   0.004688 | -32.471 ± 0.087 % | 46.418 ± 0.085 % |
+| iq1_S        | WT 1k   |             1.88 | 58.267851 ±   0.446208 | 52.036218 ±   0.424373 | 2.214858 ±   0.004778 | -31.880 ± 0.089 % | 46.330 ± 0.086 % |
+| iq1_S        | WT 100k |             1.88 | 58.581498 ±   0.453145 | 52.349864 ±   0.431360 | 2.220834 ±   0.004818 | -32.261 ± 0.089 % | 46.002 ± 0.086 % |
+| iq1_S        | WT 10m  |             1.88 | 60.694593 ±   0.471290 | 54.462959 ±   0.449644 | 2.254554 ±   0.004868 | -31.973 ± 0.088 % | 46.271 ± 0.086 % |
+| iq1_S        | WT 10k  |             1.88 | 63.221324 ±   0.493077 | 56.989691 ±   0.471423 | 2.293527 ±   0.004885 | -32.261 ± 0.089 % | 46.562 ± 0.086 % |
+
+There seems to be no consistent improvement from using more Wikitext tokens for the importance matrix.
+Relative to the legacy quants, K-quants score better on mean Δp than e.g. KL divergence would suggest.
+
+## LLaMA 2 vs. LLaMA 3 Quantization comparison
+
+| Metric          |          L2 7b q2_K |          L3 8b q2_K |        L2 7b q4_K_M |        L3 8b q4_K_M |          L2 7b q6_K |          L3 8b q6_K |          L2 7b q8_0 |          L3 8b q8_0 |
+|-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|
+| Mean PPL        | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 |
+| Mean PPL ratio  | 1.107955 ± 0.001427 | 1.564849 ± 0.004525 | 1.014242 ± 0.000432 | 1.028160 ± 0.000723 | 1.002406 ± 0.000191 | 1.003490 ± 0.000296 | 1.000689 ± 0.000107 | 1.000425 ± 0.000161 |
+| Mean ΔPPL       | 0.625552 ± 0.008725 | 3.519934 ± 0.033863 | 0.082526 ± 0.002530 | 0.175482 ± 0.004620 | 0.013941 ± 0.001110 | 0.021748 ± 0.001852 | 0.003990 ± 0.000624 | 0.002650 ± 0.001006 |
+| PPL correlation |              97.36% |              89.62% |              99.71% |              99.34% |              99.94% |              99.88% |              99.98% |              99.96% |
+| Mean KLD        | 0.108903 ± 0.000645 | 0.445132 ± 0.001835 | 0.012686 ± 0.000079 | 0.031273 ± 0.000238 | 0.002098 ± 0.000014 | 0.005452 ± 0.000035 | 0.000369 ± 0.000007 | 0.001355 ± 0.000006 |
+| Mean Δp         |    -2.710 ± 0.023 % |    -9.123 ± 0.051 % |    -0.416 ± 0.008 % |    -0.596 ± 0.014 % |    -0.035 ± 0.003 % |    -0.007 ± 0.006 % |    -0.005 ± 0.002 % |    -0.019 ± 0.003 % |
+| Maximum Δp      |             85.136% |             94.268% |             45.209% |             95.054% |             23.593% |             53.601% |             43.925% |             28.734% |
+| 99.9% Δp        |             37.184% |             50.003% |             17.461% |             27.084% |              7.798% |             13.613% |              3.387% |              6.402% |
+| 99.0% Δp        |             18.131% |             25.875% |              7.798% |             12.084% |              3.838% |              6.407% |              1.867% |              3.544% |
+| Median Δp       |             -0.391% |             -2.476% |             -0.026% |             -0.024% |             -0.001% |              0.000% |             -0.000% |             -0.000% |
+| 1.0% Δp         |            -39.762% |            -87.173% |            -11.433% |            -19.567% |             -4.222% |             -6.767% |             -1.862% |             -3.698% |
+| 0.1% Δp         |            -79.002% |            -98.897% |            -26.433% |            -56.054% |             -9.091% |            -16.584% |             -3.252% |             -6.579% |
+| Minimum Δp      |            -99.915% |            -99.965% |            -83.383% |            -98.699% |            -43.142% |            -68.487% |             -9.343% |            -24.301% |
+| RMS Δp          |     9.762 ± 0.053 % |    21.421 ± 0.079 % |     3.252 ± 0.024 % |     5.519 ± 0.050 % |     1.339 ± 0.010 % |     2.295 ± 0.019 % |     0.618 ± 0.011 % |     1.198 ± 0.007 % |
+| Same top p      |    85.584 ± 0.086 % |    71.138 ± 0.119 % |    94.665 ± 0.055 % |    91.901 ± 0.072 % |    97.520 ± 0.038 % |    96.031 ± 0.051 % |    98.846 ± 0.026 % |    97.674 ± 0.040 % |
+
+
+## Old Numbers
+
+<details>
+<summary>Llama 2 70B Scoreboard</summary>
+
+| Quantization | Model size (GiB) | Perplexity | Delta to fp16 |
+|--------------|------------------|------------|---------------|
+| Q4_0         | 36.20            | 3.5550     | 3.61%         |
+| Q4_1         | 40.20            | 3.5125     | 2.37%         |
+| Q5_0         | 44.20            | 3.4744     | 1.26%         |
+| Q2_K         | 27.27            | 3.7339     | 8.82%         |
+| Q3_K_S       | 27.86            | 3.7019     | 7.89%         |
+| Q3_K_M       | 30.83            | 3.5932     | 4.72%         |
+| Q3_K_L       | 33.67            | 3.5617     | 3.80%         |
+| Q4_K_S       | 36.39            | 3.4852     | 1.57%         |
+| Q4_K_M       | 38.54            | 3.4725     | 1.20%         |
+| Q5_K_S       | 44.20            | 3.4483     | 0.50%         |
+| Q5_K_M       | 45.41            | 3.4451     | 0.40%         |
+| Q6_K         | 52.70            | 3.4367     | 0.16%         |
+| fp16         | 128.5            | 3.4313     | -             |
+
+</details>
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -216,17 +216,22 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
 }

 struct kl_divergence_result {
-    double sum_nll  = 0;
-    double sum_nll2 = 0;
-    double sum_kld  = 0;
-    double sum_kld2 = 0;
-    double sum_nll_diff  = 0;
-    double sum_nll_diff2 = 0;
-    size_t n_same_top = 0;
-    size_t count = 0;
+    double sum_nll          = 0.0;
+    double sum_nll2         = 0.0;
+    double sum_nll_base     = 0.0;
+    double sum_nll_base2    = 0.0;
+    double sum_nll_nll_base = 0.0;
+    double sum_kld          = 0.0;
+    double sum_kld2         = 0.0;
+    double sum_p_diff       = 0.0;
+    double sum_p_diff2      = 0.0;
+    double sum_p_diff4      = 0.0;
+    float  max_p_diff       = 0.0f;
+    size_t n_same_top       = 0.0;
+    size_t count            = 0.0;
 };

-static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
+static std::pair<double, float> log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
    float max_logit = logits[0];
    int imax = 0;
    for (int i = 1; i < n_vocab; ++i) {
@@ -244,12 +249,17 @@ static double log_softmax(int n_vocab, const float * logits, const uint16_t * ba
    const float scale = d[0];
    const float min_log_prob = d[1];
    base_log_prob += 4;
-    float nll = max_logit + log_sum_exp - logits[tok];
+
+    const float nll = max_logit + log_sum_exp - logits[tok];
    kld.sum_nll  += nll;
    kld.sum_nll2 += nll*nll;
-    nll += (scale*base_log_prob[tok] + min_log_prob);
-    kld.sum_nll_diff  += nll;
-    kld.sum_nll_diff2 += nll*nll;
+
+    const float nll_base = -(scale*base_log_prob[tok] + min_log_prob);
+    kld.sum_nll_base  += nll_base;
+    kld.sum_nll_base2 += nll_base*nll_base;
+
+    kld.sum_nll_nll_base += nll*nll_base;
+
    max_logit += log_sum_exp;
    double sum = 0;
    int imax_base = -1;
@@ -269,34 +279,50 @@ static double log_softmax(int n_vocab, const float * logits, const uint16_t * ba
    kld.sum_kld2 += sum*sum;
    ++kld.count;
    if (imax == imax_base) ++kld.n_same_top;
-    return sum;
+
+    const float p_base = expf(-nll_base);
+    const float p = expf(-nll);
+    const float p_diff = p - p_base;
+    kld.sum_p_diff  += p_diff;
+    const double p_diff2 = p_diff*p_diff;
+    kld.sum_p_diff2 += p_diff2;
+    kld.sum_p_diff4 += p_diff2*p_diff2;
+    kld.max_p_diff = std::max(kld.max_p_diff, std::fabs(p_diff));
+
+    return std::make_pair(sum, p_diff);
 }

 static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
        std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
-        float * kld_values) {
+        float * kld_values, float * p_diff_values) {
    std::mutex mutex;
    const int nv = 2*((n_vocab + 1)/2) + 4;
    int counter = 0;
-    auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () {
+    auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values, p_diff_values] () {
        kl_divergence_result local_kld;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int i = counter++;
            if (i >= n_token) {
-                kld.sum_nll  += local_kld.sum_nll;
-                kld.sum_nll2 += local_kld.sum_nll2;
-                kld.sum_kld  += local_kld.sum_kld;
-                kld.sum_kld2 += local_kld.sum_kld2;
-                kld.sum_nll_diff  += local_kld.sum_nll_diff;
-                kld.sum_nll_diff2 += local_kld.sum_nll_diff2;
-                kld.n_same_top += local_kld.n_same_top;
-                kld.count += local_kld.count;
+                kld.sum_nll          += local_kld.sum_nll;
+                kld.sum_nll2         += local_kld.sum_nll2;
+                kld.sum_nll_base     += local_kld.sum_nll_base;
+                kld.sum_nll_base2    += local_kld.sum_nll_base2;
+                kld.sum_nll_nll_base += local_kld.sum_nll_nll_base;
+                kld.sum_kld          += local_kld.sum_kld;
+                kld.sum_kld2         += local_kld.sum_kld2;
+                kld.sum_p_diff       += local_kld.sum_p_diff;
+                kld.sum_p_diff2      += local_kld.sum_p_diff2;
+                kld.sum_p_diff4      += local_kld.sum_p_diff4;
+                kld.n_same_top       += local_kld.n_same_top;
+                kld.max_p_diff        = std::max(kld.max_p_diff, local_kld.max_p_diff);
+                kld.count            += local_kld.count;
                break;
            }
            lock.unlock();
-            double v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
-            kld_values[i] = (float)v;
+            std::pair<double, float> v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
+            kld_values[i]    = (float)v.first;
+            p_diff_values[i] = v.second;
        }
    };
    for (auto & w : workers) {
@@ -315,10 +341,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
    // BOS tokens will be added for each chunk before eval

    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);

    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    const int n_ctx = llama_n_ctx(ctx);

@@ -454,6 +481,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    // BOS tokens will be added for each chunk before eval

    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);

    std::ofstream logits_stream;
    if (!params.logits_file.empty()) {
@@ -470,7 +498,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    auto tim1 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);

    auto tim2 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -771,9 +799,6 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
    fprintf(stderr, "================================= is_spm = %d\n", is_spm);

-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-
    // The tasks should be randomized so the score stabilizes quickly.
    bool randomize_tasks = true;

@@ -818,7 +843,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
        for (size_t j = 0; j < 4; j++) {
            hs_cur.ending[j] = prompt_lines[idx*6+2+j];
-            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], add_bos);
+            hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
        }

        // determine the common prefix of the endings
@@ -837,7 +862,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            hs_cur.seq_tokens[2].size() - hs_cur.common_prefix +
            hs_cur.seq_tokens[3].size() - hs_cur.common_prefix;

-        //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size());
+        //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size());

        // Delete the selected random example from the prompt
        if (randomize_tasks) {
@@ -1110,12 +1135,9 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

    fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);

-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-
    for (auto & task : data) {
-        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos);
-        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos);
+        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
+        task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);

        task.common_prefix = 0;
        for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {
@@ -1130,8 +1152,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
            task.seq_tokens[0].size() - task.common_prefix +
            task.seq_tokens[1].size() - task.common_prefix;

-        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos).size();
-        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos).size();
+        task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
+        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
    }

    fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
@@ -1322,7 +1344,7 @@ struct multiple_choice_task {
    std::vector<float> log_probs;
 };

-static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
+static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
    if (task.question.empty() || task.mc1.answers.empty()) {
        if (log_error) {
            printf("%s: found bad task with empty question and/or answers\n", __func__);
@@ -1337,7 +1359,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos,
            }
            return false;
        }
-        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
+        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
    }
    auto min_len = task.seq_tokens.front().size();
    for (auto& seq : task.seq_tokens) {
@@ -1436,9 +1458,6 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        n_task = params.multiple_choice_tasks;
    }

-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-
    printf("%s: preparing task data", __func__);
    fflush(stdout);
    if (n_task > 500) {
@@ -1446,7 +1465,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        fflush(stdout);
        std::atomic<int> counter(0);
        std::atomic<int> n_bad(0);
-        auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
+        auto prepare = [&counter, &n_bad, &tasks, ctx] () {
            int num_tasks = tasks.size();
            int n_bad_local = 0;
            while (true) {
@@ -1457,7 +1476,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                }
                int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
                for (int i = first; i < last; ++i) {
-                    if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
+                    if (!multiple_choice_prepare_one_task(ctx, tasks[i], false)) ++n_bad_local;
                }
            }
        };
@@ -1479,7 +1498,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        int i_task = 0;
        for (auto& task : tasks) {
            ++i_task;
-            if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
+            if (!multiple_choice_prepare_one_task(ctx, task, true)) {
                return;
            }
            if (i_task%n_dot == 0) {
@@ -1715,9 +1734,11 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    const int num_batches = (n_ctx + n_batch - 1)/n_batch;
    const int nv = 2*((n_vocab + 1)/2) + 4;
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);

    std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
-    std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
+    std::vector<float>    kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
+    std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
    std::vector<float> logits;
    if (num_batches > 1) {
        logits.reserve(n_ctx * n_vocab);
@@ -1734,9 +1755,18 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
        return std::make_pair(f, df);
    };
+    auto covariance = [] (double suma, double sumb, double sumab, size_t count) {
+        if (count < 10) {
+            return 0.0;
+        }
+        double var = sumab/count - (suma/count)*(sumb/count);
+        var /= count - 1;
+        return var;
+    };

    kl_divergence_result kld;
-    auto kld_ptr = kld_values.data();
+    auto    kld_ptr =    kld_values.data();
+    auto p_diff_ptr = p_diff_values.data();

    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * n_ctx;
@@ -1791,24 +1821,42 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
            }
            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);

-            printf("\nchunk        PPL          ln(PPL(Q)/PPL(base))          KL-Divergence           Same top\n");
+            printf("\nchunk             PPL               ln(PPL(Q)/PPL(base))          KL Divergence              Δp RMS            Same top p\n");
        }

        const int first = n_ctx/2;
        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                workers, log_probs_uint16, kld, kld_ptr);
-        kld_ptr += n_ctx - 1 - first;
+                workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
+        p_diff_ptr += n_ctx - 1 - first;
+        kld_ptr    += n_ctx - 1 - first;

-        auto ppl           = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
-        auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
-        auto kl_div        = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-        auto p_top = 1.*kld.n_same_top/kld.count;
-        auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1));
+        printf("%4d", i+1);

-        printf("%4d    %10.4lf    %10.5lf ± %10.5f    %10.5f ± %10.5lf    %.5f ± %.5f\n", i+1, exp(ppl.first),
-                log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second,
-                p_top, d_p_top);
+        auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
+        const double ppl_val = exp(log_ppl.first);
+        const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
+        printf("    %9.4lf ± %9.4lf", ppl_val, ppl_unc);
+
+        auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
+        const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
+        const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
+        const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
+        printf("    %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
+
+        auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
+        printf("    %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
+
+        auto p_diff_mse   = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
+        const double p_diff_rms_val = sqrt(p_diff_mse.first);
+        const double p_diff_rms_unc = p_diff_rms_val > 0.0 ? 0.5/p_diff_rms_val * p_diff_mse.second : 0.0; // guard: 0.5/0 * 0 would be NaN when the RMS is exactly zero
+        printf("    %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+
+        double p_top_val = 1.*kld.n_same_top/kld.count;
+        double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
+        printf("    %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
+
+        printf("\n");

        fflush(stdout);

@@ -1819,31 +1867,97 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    if (kld.count < 100) return; // we do not wish to do statistics on so few values

    std::sort(kld_values.begin(), kld_values.end());
+    std::sort(p_diff_values.begin(), p_diff_values.end());

-    printf("===== KL-divergence statistics\n");
+    printf("====== Perplexity statistics ======\n");
+
+    auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
+    const double ppl_val = exp(log_ppl.first);
+    const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
+    printf("Mean PPL(Q)                   : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
+
+    auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
+    const double ppl_base_val = exp(log_ppl_base.first);
+    const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
+    printf("Mean PPL(base)                : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
+
+    const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
+    // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
+    const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
+    printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
+
+    const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
+    const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
+    printf("Mean ln(PPL(Q)/PPL(base))     : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
+
+    const double ppl_ratio_val = exp(log_ppl_ratio_val);
+    const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
+    printf("Mean PPL(Q)/PPL(base)         : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
+
+    const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
+    const double ppl_diff_val = ppl_val - ppl_base_val;
+    const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
+    printf("Mean PPL(Q)-PPL(base)         : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
+
+    printf("\n");
+
+    printf("====== KL divergence statistics ======\n");
    auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-    printf("Average: %10.6f ±%10.6lf\n", kl_div.first, kl_div.second);
+    printf("Mean    KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
    auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
                                               : kld_values[kld_values.size()/2];
-    printf("Median : %10.6f\n", kld_median);

-    auto percentile = [&kld_values] (float fraction) {
-        if (fraction <= 0) return kld_values.front();
-        if (fraction >= 1) return kld_values.back();
-        float p = fraction*(kld_values.size() - 1);
+    auto percentile = [] (const std::vector<float> & values, float fraction) { // values must be sorted ascending; taken by const reference to avoid copying the whole sample on every call
+        if (fraction <= 0) return values.front();
+        if (fraction >= 1) return values.back();
+        float p = fraction*(values.size() - 1);
         size_t ip = size_t(p); p -= ip;
-        return (1 - p)*kld_values[ip] + p*kld_values[std::min(ip+1, kld_values.size()-1)];
+        return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
     };

-    printf("Maximum: %10.6f\n", kld_values.back());
-    printf("KLD_99 : %10.6f\n", percentile(0.99f));
-    printf("KLD_95 : %10.6f\n", percentile(0.95f));
-    printf("KLD_90 : %10.6f\n", percentile(0.90f));
+    printf("Maximum KLD: %10.6f\n", kld_values.back());
+    printf("99.9%%   KLD: %10.6f\n", percentile(kld_values, 0.999f));
+    printf("99.0%%   KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    printf("95.0%%   KLD: %10.6f\n", percentile(kld_values, 0.950f));
+    printf("Median  KLD: %10.6f\n", kld_median);
+    printf("10.0%%   KLD: %10.6f\n", percentile(kld_values, 0.100f));
+    printf(" 5.0%%   KLD: %10.6f\n", percentile(kld_values, 0.050f));
+    printf(" 1.0%%   KLD: %10.6f\n", percentile(kld_values, 0.010f));
+    printf("Minimum KLD: %10.6f\n", kld_values.front());

-    printf("Minimum: %10.6f\n", kld_values.front());
-    printf("KLD_01 : %10.6f\n", percentile(0.01f));
-    printf("KLD_05 : %10.6f\n", percentile(0.05f));
-    printf("KLD_10 : %10.6f\n", percentile(0.10f));
+    printf("\n");
+
+    printf("====== Token probability statistics ======\n");
+
+    auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
+    printf("Mean    Δp: %6.3lf ± %5.3lf %%\n",  100.0*p_diff.first, 100.0*p_diff.second);
+
+    auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
+                                               : p_diff_values[p_diff_values.size()/2];
+
+    printf("Maximum Δp: %6.3lf%%\n",  100.0*p_diff_values.back());
+    printf("99.9%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
+    printf("99.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
+    printf("95.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
+    printf("90.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
+    printf("75.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
+    printf("Median  Δp: %6.3lf%%\n",  100.0*p_diff_median);
+    printf("25.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
+    printf("10.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
+    printf(" 5.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
+    printf(" 1.0%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
+    printf(" 0.1%%   Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
+    printf("Minimum Δp: %6.3lf%%\n",  100.0*p_diff_values.front());
+
+    auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
+    // printf("MSE Δp    : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
+
+    const double p_diff_rms_val = sqrt(p_diff_mse.first);
+    const double p_diff_rms_unc = p_diff_rms_val > 0.0 ? 0.5/p_diff_rms_val * p_diff_mse.second : 0.0; // guard: 0.5/0 * 0 would be NaN when the RMS is exactly zero
+    printf("RMS Δp    : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+
+    const double same_top_p = 1.0*kld.n_same_top/kld.count;
+    printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));

 }

@@ -1858,12 +1972,20 @@ int main(int argc, char ** argv) {

    const int32_t n_ctx = params.n_ctx;

+    if (n_ctx <= 0) {
+        fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
+        return 1;
+    }
+
    const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence;
+
    if (ppl) {
-        int n_seq = std::max(1, params.n_batch / n_ctx);
-        int32_t n_kv = n_seq * n_ctx;
+        const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
+        const int32_t n_kv = n_seq * n_ctx;
+
        params.n_parallel = n_seq;
-        params.n_ctx = n_kv;
+        params.n_ctx      = n_kv;
+
        params.n_batch = std::min(params.n_batch, n_kv);
    } else {
        params.n_batch = std::min(params.n_batch, params.n_ctx);
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -23,7 +23,7 @@
 #endif

 struct quantize_stats_params {
-    std::string model = "models/7B/ggml-model-f16.gguf";
+    std::string model = DEFAULT_MODEL_PATH;
    bool verbose = false;
    bool per_layer_stats = false;
    bool print_histogram = false;
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -4,17 +4,17 @@ TODO

 ## Llama 2 7B

-Quantization | Bits per Weight (BPW)
-- | --
-Q2_K | 3.35
-Q3_K_S | 3.50
-Q3_K_M | 3.91
-Q3_K_L | 4.27
-Q4_K_S | 4.58
-Q4_K_M | 4.84
-Q5_K_S | 5.52
-Q5_K_M | 5.68
-Q6_K | 6.56
+| Quantization | Bits per Weight (BPW) |
+|--------------|-----------------------|
+| Q2_K         | 3.35                  |
+| Q3_K_S       | 3.50                  |
+| Q3_K_M       | 3.91                  |
+| Q3_K_L       | 4.27                  |
+| Q4_K_S       | 4.58                  |
+| Q4_K_M       | 4.84                  |
+| Q5_K_S       | 5.52                  |
+| Q5_K_M       | 5.68                  |
+| Q6_K         | 6.56                  |

 ## Llama 2 13B
 Quantization | Bits per Weight (BPW)
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -8,7 +8,6 @@
 #include <unordered_map>
 #include <fstream>
 #include <cmath>
-#include <algorithm>

 struct quant_option {
    std::string name;
@@ -53,6 +52,10 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing", },
 };

+static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE       = "quantize.imatrix.file";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix.dataset";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS   = "quantize.imatrix.chunks_count";

 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
    std::string ftype_str;
@@ -97,6 +100,7 @@ static void usage(const char * executable) {
    printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf("  --keep-split: will generate quantized model in the same shards as input\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -112,7 +116,7 @@ static void usage(const char * executable) {
    exit(1);
 }

-static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
    if (!in) {
        printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
@@ -159,18 +163,33 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
        }
    }
-    printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
+
+    // latest imatrix version contains the dataset filename at the end of the file
+    int m_last_call = 0;
+    if (in.peek() != EOF) {
+        in.read((char *)&m_last_call, sizeof(m_last_call));
+        int dataset_len;
+        in.read((char *)&dataset_len, sizeof(dataset_len));
+        std::vector<char> dataset_as_vec(dataset_len);
+        in.read(dataset_as_vec.data(), dataset_len);
+        imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
+        printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
+    }
+    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
+    return m_last_call;
 }

-static void prepare_imatrix(const std::string & imatrix_file,
+static int prepare_imatrix(const std::string & imatrix_file,
+        std::string & imatrix_dataset,
        const std::vector<std::string> & included_weights,
        const std::vector<std::string> & excluded_weights,
        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+    int m_last_call = -1;
    if (!imatrix_file.empty()) {
-        load_imatrix(imatrix_file, imatrix_data);
+        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
    }
    if (imatrix_data.empty()) {
-        return;
+        return m_last_call;
    }
    if (!excluded_weights.empty()) {
        for (auto& name : excluded_weights) {
@@ -196,6 +215,7 @@ static void prepare_imatrix(const std::string & imatrix_file,
    if (!imatrix_data.empty()) {
        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
    }
+    return m_last_call;
 }

 static ggml_type parse_ggml_type(const char * arg) {
@@ -210,43 +230,6 @@ static ggml_type parse_ggml_type(const char * arg) {
    return result;
 }

-static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
-    const char* sep = strchr(data, '=');
-    if (sep == nullptr || sep - data >= 128) {
-        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
-        return false;
-    }
-    llama_model_kv_override kvo;
-    std::strncpy(kvo.key, data, sep - data);
-    kvo.key[sep - data] = 0;
-    sep++;
-    if (strncmp(sep, "int:", 4) == 0) {
-        sep += 4;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-        kvo.int_value = std::atol(sep);
-    } else if (strncmp(sep, "float:", 6) == 0) {
-        sep += 6;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-        kvo.float_value = std::atof(sep);
-    } else if (strncmp(sep, "bool:", 5) == 0) {
-        sep += 5;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-        if (std::strcmp(sep, "true") == 0) {
-            kvo.bool_value = true;
-        } else if (std::strcmp(sep, "false") == 0) {
-            kvo.bool_value = false;
-        } else {
-            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
-            return false;
-        }
-    } else {
-        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
-        return false;
-    }
-    overrides.emplace_back(std::move(kvo));
-    return true;
-}
-
 int main(int argc, char ** argv) {
    if (argc < 3) {
        usage(argv[0]);
@@ -300,6 +283,8 @@ int main(int argc, char ** argv) {
            } else {
                usage(argv[0]);
            }
+        } else if (strcmp(argv[arg_idx], "--keep-split") == 0) { // strcmp() returns 0 on a match
+            params.keep_split = true;
        } else {
            usage(argv[0]);
        }
@@ -313,10 +298,43 @@ int main(int argc, char ** argv) {
        usage(argv[0]);
    }

+    std::string imatrix_dataset;
    std::unordered_map<std::string, std::vector<float>> imatrix_data;
-    prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
+    int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
    if (!imatrix_data.empty()) {
        params.imatrix = &imatrix_data;
+        {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+            strncpy(kvo.val_str, imatrix_file.c_str(), 127);
+            kvo.val_str[127] = '\0';
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+        if (!imatrix_dataset.empty()) {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+            strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
+            kvo.val_str[127] = '\0';
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+
+        {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.val_i64 = imatrix_data.size();
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+
+        if (m_last_call > 0) {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.val_i64 = m_last_call;
+            kv_overrides.emplace_back(std::move(kvo));
+        }
    }
    if (!kv_overrides.empty()) {
        kv_overrides.emplace_back();
@@ -332,20 +350,28 @@ int main(int argc, char ** argv) {
    std::string fname_out;

    std::string ftype_str;
+    std::string suffix = ".gguf";
    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
        std::string fpath;
        const size_t pos = fname_inp.find_last_of("/\\");
        if (pos != std::string::npos) {
            fpath = fname_inp.substr(0, pos + 1);
        }
-        // export as [inp path]/ggml-model-[ftype].gguf
-        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
+
+        // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+        fname_out = fpath + "ggml-model-" + ftype_str;
+        if (!params.keep_split) {
+            fname_out += suffix;
+        }
        arg_idx++;
        if (ftype_str == "COPY") {
            params.only_copy = true;
        }
    } else {
        fname_out = argv[arg_idx];
+        if (params.keep_split && fname_out.length() >= suffix.length() && fname_out.compare(fname_out.length() - suffix.length(), suffix.length(), suffix) == 0) { // strip only a trailing suffix, not one found mid-path
+            fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
+        }
        arg_idx++;

        if (argc <= arg_idx) {
--- a/examples/quantize/tests.sh
+++ b/examples/quantize/tests.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+set -eu
+
+if [ $# -lt 1 ]
+then
+    echo "usage:   $0 path_to_build_binary [path_to_temp_folder]"
+    echo "example: $0 ../../build/bin ../../tmp"
+    exit 1
+fi
+
+if [ $# -gt 1 ]
+then
+    TMP_DIR=$2
+else
+    TMP_DIR=/tmp
+fi
+
+set -x
+
+SPLIT=$1/gguf-split
+QUANTIZE=$1/quantize
+MAIN=$1/main
+WORK_PATH=$TMP_DIR/quantize
+ROOT_DIR=$(realpath $(dirname $0)/../../)
+
+mkdir -p "$WORK_PATH"
+
+# Clean up in case of previously failed test
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
+
+# 1. Get a model
+(
+cd $WORK_PATH
+"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+)
+echo PASS
+
+# 2. Split model
+$SPLIT --split-max-tensors 28  $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
+echo PASS
+echo
+
+# 3. Requant model with '--keep-split'
+$QUANTIZE --allow-requantize --keep-split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
+echo PASS
+echo
+
+# 3a. Test the requanted model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 4. Requant model without '--keep-split'
+$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
+echo PASS
+echo
+
+# 4a. Test the requanted model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# Clean up
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
--- a/examples/regex-to-grammar.py
+++ b/examples/regex-to-grammar.py
@@ -8,7 +8,7 @@ print(subprocess.check_output(
        "python",
        os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
-        "json-schema-to-grammar.py"),
+        "json_schema_to_grammar.py"),
        *rest,
        "-",
        "--raw-pattern",
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -24,6 +24,7 @@ int main(int argc, char ** argv) {

    std::string result0;
    std::string result1;
+    std::string result2;

    // init
    llama_model * model;
@@ -44,8 +45,8 @@ int main(int argc, char ** argv) {

    // save state (rng, logits, embedding and kv_cache) to file
    {
-        std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
-        const size_t written = llama_copy_state_data(ctx, state_mem.data());
+        std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
+        const size_t written = llama_state_get_data(ctx, state_mem.data());

        FILE *fp_write = fopen("dump_state.bin", "wb");
        fwrite(state_mem.data(), 1, written, fp_write);
@@ -97,13 +98,13 @@ int main(int argc, char ** argv) {

    // load state (rng, logits, embedding and kv_cache) from file
    {
-        std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
+        std::vector<uint8_t> state_mem(llama_state_get_size(ctx2));

        FILE * fp_read = fopen("dump_state.bin", "rb");
        const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
        fclose(fp_read);

-        if (read != llama_set_state_data(ctx2, state_mem.data())) {
+        if (read != llama_state_set_data(ctx2, state_mem.data())) {
            fprintf(stderr, "\n%s : failed to read state\n", __func__);
            llama_free(ctx2);
            llama_free_model(model);
@@ -141,16 +142,104 @@ int main(int argc, char ** argv) {
        n_past += 1;
    }

-    printf("\n");
+    printf("\n\n");

    llama_free(ctx2);
-    llama_free_model(model);

    if (result0 != result1) {
        fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
        return 1;
    }

+    // make new context
+    auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+
+    printf("\nsingle seq run: %s", params.prompt.c_str());
+
+    // load state (rng, logits, embedding and kv_cache) from file
+    {
+        std::vector<uint8_t> state_mem(llama_state_get_size(ctx3));
+
+        FILE * fp_read = fopen("dump_state.bin", "rb");
+        const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
+        fclose(fp_read);
+
+        if (read != llama_state_set_data(ctx3, state_mem.data())) {
+            fprintf(stderr, "\n%s : failed to read state\n", __func__);
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+
+        fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size());
+    }
+
+    // restore state (last tokens)
+    n_past = n_past_saved;
+
+    // save seq 0 and load into seq 1
+    {
+        // save kv of seq 0
+        std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
+        const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0);
+        if (ncopy != seq_store.size()) {
+            fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+        fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
+
+        // erase whole kv
+        llama_kv_cache_clear(ctx3);
+        fprintf(stderr, "%s : kv cache cleared\n", __func__);
+
+        // restore kv into seq 1
+        const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1);
+        if (nset != seq_store.size()) {
+            fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+        fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
+    }
+
+    // third run with seq 1 instead of 0
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto * logits = llama_get_logits(ctx3);
+        auto n_vocab = llama_n_vocab(model);
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        auto next_token = llama_sample_token(ctx3, &candidates_p);
+        auto next_token_str = llama_token_to_piece(ctx3, next_token);
+
+        printf("%s", next_token_str.c_str());
+        result2 += next_token_str;
+
+        if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            llama_free(ctx3);
+            llama_free_model(model);
+            return 1;
+        }
+        n_past += 1;
+    }
+
+    printf("\n");
+
+    llama_free(ctx3);
+    llama_free_model(model);
+
+    if (result0 != result2) {
+        fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
+        return 1;
+    }
+
    fprintf(stderr, "\n%s : success\n", __func__);

    return 0;
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,17 +1,34 @@
 set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET}
+include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+set(TARGET_SRCS
    server.cpp
    utils.hpp
    httplib.h
 )
+set(PUBLIC_ASSETS
+    index.html
+    index.js
+    completion.js
+    json-schema-to-grammar.mjs
+)
+foreach(asset ${PUBLIC_ASSETS})
+    set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
+    set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
+    list(APPEND TARGET_SRCS ${output})
+    add_custom_command(
+        DEPENDS "${input}"
+        OUTPUT "${output}"
+        COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
+    )
+endforeach()
+add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common json-schema-to-grammar ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
 if (LLAMA_SERVER_SSL)
    find_package(OpenSSL REQUIRED)
    target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -11,6 +11,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 * Continuous batching
 * Multimodal (wip)
 * Monitoring endpoints
+ * Schema-constrained JSON response format

 The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).

@@ -57,6 +58,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
 - `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1`
 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
 - `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled
+- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled.
 - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name.  Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
 - `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
 - `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
@@ -72,15 +74,18 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
 - Using `make`:

  ```bash
-  make
+  make server
  ```

 - Using `CMake`:

  ```bash
-  cmake --build . --config Release
+  cmake -B build
+  cmake --build build --config Release -t server
  ```

+  The binary is located at `./build/bin/server`.
+
 ## Build with SSL

 `server` can also be built with SSL support using OpenSSL 3
@@ -97,10 +102,8 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
 - Using `CMake`:

  ```bash
-  mkdir build
-  cd build
-  cmake .. -DLLAMA_SERVER_SSL=ON
-  make server
+  cmake -B build -DLLAMA_SERVER_SSL=ON
+  cmake --build build --config Release -t server
  ```

 ## Quick Start
@@ -249,6 +252,8 @@ node index.js

    `grammar`: Set grammar for grammar-based sampling.  Default: no grammar

+    `json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` for a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features.  Default: no JSON schema.
+
    `seed`: Set the random number generator (RNG) seed.  Default: `-1`, which is a random seed.

    `ignore_eos`: Ignore end of stream token and continue generating.  Default: `false`
@@ -364,6 +369,8 @@ Notice that each `probs` is an array of length `n_probs`.

    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.

+    The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}`), similar to other OpenAI-inspired API providers.
+
    *Examples:*

    You can use either Python `openai` library with appropriate checkpoints:
@@ -517,6 +524,57 @@ Available metrics:
 - `llamacpp:requests_processing`: Number of requests processing.
 - `llamacpp:requests_deferred`: Number of requests deferred.

+- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
+
+    *Options:*
+
+    `filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.
+
+### Result JSON
+
+```json
+{
+    "id_slot": 0,
+    "filename": "slot_save_file.bin",
+    "n_saved": 1745,
+    "n_written": 14309796,
+    "timings": {
+        "save_ms": 49.865
+    }
+}
+```
+
+- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
+
+    *Options:*
+
+    `filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.
+
+### Result JSON
+
+```json
+{
+    "id_slot": 0,
+    "filename": "slot_save_file.bin",
+    "n_restored": 1745,
+    "n_read": 14309796,
+    "timings": {
+        "restore_ms": 42.937
+    }
+}
+```
+
+- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.
+
+### Result JSON
+
+```json
+{
+    "id_slot": 0,
+    "n_erased": 1745
+}
+```
+
 ## More examples

 ### Change system prompt on runtime
--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -2,13 +2,15 @@

 Benchmark is using [k6](https://k6.io/).

-##### Install k6
+##### Install k6 and sse extension

-Follow instruction from: https://k6.io/docs/get-started/installation/
+SSE is not supported by default in k6; you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.

-Example for ubuntu:
+Example:
 ```shell
-snap install k6
+go install go.k6.io/xk6/cmd/xk6@latest
+xk6 build master \
+--with github.com/phymbert/xk6-sse
 ```

 #### Download a dataset
@@ -46,7 +48,7 @@ server --host localhost --port 8080 \

 For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run:
 ```shell
-k6 run script.js --duration 10m --iterations 500 --vus 8
+./k6 run script.js --duration 10m --iterations 500 --vus 8
 ```

 The benchmark values can be overridden with:
@@ -86,3 +88,33 @@ K6 metrics might be compared against [server metrics](../README.md), with:
 ```shell
 curl http://localhost:8080/metrics
 ```
+
+### Using the CI python script
+The `bench.py` script performs several steps:
+- start the server
+- define the variables required by k6
+- run the k6 script
+- extract metrics from Prometheus
+
+It is intended for use in the CI, but you can also run it manually:
+
+```shell
+LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
+              --runner-label local \
+              --name local \
+              --branch `git rev-parse --abbrev-ref HEAD` \
+              --commit `git rev-parse HEAD` \
+              --scenario script.js \
+              --duration 5m \
+              --hf-repo ggml-org/models \
+              --hf-file phi-2/ggml-model-q4_0.gguf \
+              --model-path-prefix models \
+              --parallel 4 \
+              -ngl 33 \
+              --batch-size 2048 \
+              --ubatch-size 256 \
+              --ctx-size 4096 \
+              --n-prompts 200 \
+              --max-prompt-tokens 256 \
+              --max-tokens 256
+```
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -76,7 +76,6 @@ def main(args_in: list[str] | None = None) -> None:
                            data['metrics'][metric_name][metric_metric]=value
                            github_env.write(
                                f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
-                token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
                iterations = data['root_group']['checks']['success completion']['passes']

    except Exception:
@@ -181,16 +180,16 @@ xychart-beta
    bench_results = {
        "i": iterations,
        "req": {
-            "p90": round(data['metrics']["http_req_duration"]["p(90)"], 2),
+            "p95": round(data['metrics']["http_req_duration"]["p(95)"], 2),
            "avg": round(data['metrics']["http_req_duration"]["avg"], 2),
        },
        "pp": {
-            "p90": round(data['metrics']["llamacpp_prompt_tokens"]["p(90)"], 2),
-            "avg": round(data['metrics']["llamacpp_prompt_tokens"]["avg"], 2),
+            "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
+            "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
            "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
        },
        "tg": {
-            "p90": round(data['metrics']["llamacpp_tokens_second"]["p(90)"], 2),
+            "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
            "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
            "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
        },
@@ -206,7 +205,7 @@ xychart-beta


 def start_benchmark(args):
-    k6_path = 'k6'
+    k6_path = './k6'
    if 'BENCH_K6_BIN_PATH' in os.environ:
        k6_path = os.environ['BENCH_K6_BIN_PATH']
    k6_args = [
@@ -269,6 +268,7 @@ def start_server_background(args):
    server_args.extend(['--defrag-thold', "0.1"])
    server_args.append('--cont-batching')
    server_args.append('--metrics')
+    server_args.append('--flash-attn')
    server_args.extend(['--log-format', "text"])
    args = [str(arg) for arg in [server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(args)}")
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -1,4 +1,4 @@
-import http from 'k6/http'
+import sse from 'k6/x/sse'
 import {check, sleep} from 'k6'
 import {SharedArray} from 'k6/data'
 import {Counter, Rate, Trend} from 'k6/metrics'
@@ -53,7 +53,9 @@ const data = new SharedArray('conversations', function () {

 const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
+
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
+const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')

 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -86,36 +88,63 @@ export default function () {
            }
        ],
        "model": model,
-        "stream": false,
+        "stream": true,
        "seed": 42,
-        "max_tokens": max_tokens
+        "max_tokens": max_tokens,
+        "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
    }

-    const body = JSON.stringify(payload)
+    const params = {method: 'POST', body: JSON.stringify(payload)};

-    let res = http.post(`${server_url}/chat/completions`, body, {
-        headers: {'Content-Type': 'application/json'},
-        timeout: '300s'
+    const startTime = new Date()
+    let promptEvalEndTime = null
+    let prompt_tokens = 0
+    let completions_tokens = 0
+    let finish_reason = null
+    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
+        client.on('event', function (event) {
+            if (promptEvalEndTime == null) {
+                promptEvalEndTime = new Date()
+            }
+
+            let chunk = JSON.parse(event.data)
+            let choice = chunk.choices[0]
+            if (choice.finish_reason) {
+                finish_reason = choice.finish_reason
+            }
+
+            if (chunk.usage) {
+                prompt_tokens = chunk.usage.prompt_tokens
+                llamacpp_prompt_tokens.add(prompt_tokens)
+                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
+
+                completions_tokens = chunk.usage.completion_tokens
+                llamacpp_completion_tokens.add(completions_tokens)
+                llamacpp_completion_tokens_total_counter.add(completions_tokens)
+            }
+        })
+
+        client.on('error', function (e) {
+            console.log('An unexpected error occurred: ', e.error());
+            throw e;
+        })
    })

    check(res, {'success completion': (r) => r.status === 200})

-    if (res.status === 200) {
-        const completions = res.json()
+    const endTime = new Date()

-        llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
-        llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
-
-        llamacpp_completion_tokens.add(completions.usage.completion_tokens)
-        llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
-
-        llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
-        llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
-
-        llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
-    } else {
-        console.error(`response: ${res.body} request=${payload}`)
+    const promptEvalTime = promptEvalEndTime - startTime
+    if (promptEvalTime > 0) {
+        llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3)
    }

+    const completion_time = endTime - promptEvalEndTime
+    if (completions_tokens > 0 && completion_time > 0) {
+        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
+    }
+    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
+    llamacpp_completions_stop_rate.add(finish_reason === 'stop')
+
    sleep(0.3)
 }
--- a/examples/server/completion.js.hpp
+++ b/examples/server/completion.js.hpp
@@ -1,496 +0,0 @@
-unsigned char completion_js[] = {
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x44,
-  0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x3a, 0x20, 0x74, 0x72,
-  0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64,
-  0x69, 0x63, 0x74, 0x3a, 0x20, 0x35, 0x30, 0x30, 0x2c, 0x0a, 0x20, 0x20,
-  0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x3a,
-  0x20, 0x30, 0x2e, 0x32, 0x2c, 0x0a, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70,
-  0x3a, 0x20, 0x5b, 0x22, 0x3c, 0x2f, 0x73, 0x3e, 0x22, 0x5d, 0x0a, 0x7d,
-  0x3b, 0x0a, 0x0a, 0x6c, 0x65, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
-  0x67, 0x73, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x0a,
-  0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65,
-  0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
-  0x20, 0x61, 0x73, 0x20, 0x61, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
-  0x74, 0x6f, 0x72, 0x2e, 0x20, 0x52, 0x65, 0x63, 0x6f, 0x6d, 0x6d, 0x65,
-  0x6e, 0x64, 0x65, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x6d, 0x6f, 0x73,
-  0x74, 0x20, 0x75, 0x73, 0x65, 0x20, 0x63, 0x61, 0x73, 0x65, 0x73, 0x2e,
-  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70,
-  0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
-  0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27,
-  0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e,
-  0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65,
-  0x73, 0x74, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x22,
-  0x54, 0x65, 0x6c, 0x6c, 0x20, 0x6d, 0x65, 0x20, 0x61, 0x20, 0x6a, 0x6f,
-  0x6b, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64,
-  0x69, 0x63, 0x74, 0x3a, 0x20, 0x38, 0x30, 0x30, 0x7d, 0x29, 0x0a, 0x2f,
-  0x2f, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61,
-  0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68,
-  0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65,
-  0x73, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77,
-  0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64,
-  0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
-  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x2f, 0x2f, 0x0a,
-  0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63,
-  0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x2a, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c,
-  0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d,
-  0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b,
-  0x7d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63,
-  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20,
-  0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x72,
-  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x20, 0x3d,
-  0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x61, 0x70, 0x69, 0x5f,
-  0x75, 0x72, 0x6c, 0x20, 0x7c, 0x7c, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x0a,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x63, 0x6f, 0x6e, 0x74, 0x72,
-  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20,
-  0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43,
-  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x28, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
-  0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e,
-  0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x44, 0x65, 0x66, 0x61, 0x75,
-  0x6c, 0x74, 0x73, 0x2c, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x2c, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x7d,
-  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72,
-  0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x77,
-  0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x60, 0x24,
-  0x7b, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x7d, 0x2f, 0x63, 0x6f,
-  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x60, 0x2c, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x74, 0x68, 0x6f, 0x64, 0x3a,
-  0x20, 0x27, 0x50, 0x4f, 0x53, 0x54, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x62, 0x6f, 0x64, 0x79, 0x3a, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e,
-  0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x63, 0x6f,
-  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x61,
-  0x64, 0x65, 0x72, 0x73, 0x3a, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x27, 0x43, 0x6f, 0x6e, 0x6e, 0x65, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x27, 0x3a, 0x20, 0x27, 0x6b, 0x65, 0x65, 0x70, 0x2d, 0x61, 0x6c,
-  0x69, 0x76, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x27, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54, 0x79, 0x70,
-  0x65, 0x27, 0x3a, 0x20, 0x27, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61,
-  0x74, 0x69, 0x6f, 0x6e, 0x2f, 0x6a, 0x73, 0x6f, 0x6e, 0x27, 0x2c, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, 0x41, 0x63, 0x63, 0x65, 0x70,
-  0x74, 0x27, 0x3a, 0x20, 0x27, 0x74, 0x65, 0x78, 0x74, 0x2f, 0x65, 0x76,
-  0x65, 0x6e, 0x74, 0x2d, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x27, 0x2c,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x28, 0x70,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65,
-  0x79, 0x20, 0x3f, 0x20, 0x7b, 0x27, 0x41, 0x75, 0x74, 0x68, 0x6f, 0x72,
-  0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x27, 0x3a, 0x20, 0x60, 0x42,
-  0x65, 0x61, 0x72, 0x65, 0x72, 0x20, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x7d, 0x60,
-  0x7d, 0x20, 0x3a, 0x20, 0x7b, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61,
-  0x6c, 0x3a, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65,
-  0x72, 0x2e, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x0a, 0x20, 0x20,
-  0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, 0x65,
-  0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x2e, 0x62, 0x6f, 0x64, 0x79, 0x2e,
-  0x67, 0x65, 0x74, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63,
-  0x6f, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x54,
-  0x65, 0x78, 0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x28, 0x29,
-  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e,
-  0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20,
-  0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65,
-  0x72, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x42,
-  0x75, 0x66, 0x66, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x61,
-  0x72, 0x74, 0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x72, 0x65, 0x61, 0x64,
-  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72,
-  0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
-  0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20,
-  0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73,
-  0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
-  0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61, 0x64, 0x28,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
-  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f, 0x6e, 0x65,
-  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
-  0x20, 0x41, 0x64, 0x64, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x6c, 0x65, 0x66,
-  0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x74,
-  0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e,
-  0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64,
-  0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x6c,
-  0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x64, 0x65,
-  0x63, 0x6f, 0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65,
-  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
-  0x2f, 0x20, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x20, 0x69, 0x66, 0x20, 0x74,
-  0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x72,
-  0x61, 0x63, 0x74, 0x65, 0x72, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x6c,
-  0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x6e,
-  0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65, 0x42, 0x72,
-  0x65, 0x61, 0x6b, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, 0x65,
-  0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x5c, 0x6e, 0x27,
-  0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
-  0x20, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74,
-  0x65, 0x78, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x6c, 0x69, 0x6e,
-  0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74,
-  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78,
-  0x74, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x5c, 0x6e, 0x27,
-  0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
-  0x20, 0x49, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, 0x78, 0x74,
-  0x20, 0x64, 0x6f, 0x65, 0x73, 0x6e, 0x27, 0x74, 0x20, 0x65, 0x6e, 0x64,
-  0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65,
-  0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2c, 0x20, 0x74, 0x68, 0x65, 0x6e,
-  0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x6c, 0x69,
-  0x6e, 0x65, 0x20, 0x69, 0x73, 0x20, 0x69, 0x6e, 0x63, 0x6f, 0x6d, 0x70,
-  0x6c, 0x65, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
-  0x2f, 0x20, 0x53, 0x74, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x74, 0x20, 0x69,
-  0x6e, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x74,
-  0x6f, 0x20, 0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64, 0x20, 0x74,
-  0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6e, 0x65, 0x78, 0x74, 0x20, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21,
-  0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65,
-  0x42, 0x72, 0x65, 0x61, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65,
-  0x72, 0x20, 0x3d, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x2e, 0x70, 0x6f,
-  0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72,
-  0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x52, 0x65,
-  0x73, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72,
-  0x20, 0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, 0x76, 0x65, 0x20,
-  0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b,
-  0x20, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e, 0x64, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x50, 0x61, 0x72, 0x73, 0x65, 0x20,
-  0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76, 0x65, 0x6e,
-  0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64, 0x20, 0x74,
-  0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20, 0x2f, 0x5e,
-  0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e, 0x2a, 0x29,
-  0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c,
-  0x69, 0x6e, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73,
-  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20,
-  0x3d, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x2e, 0x65, 0x78, 0x65, 0x63,
-  0x28, 0x6c, 0x69, 0x6e, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x61, 0x74, 0x63,
-  0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b, 0x6d, 0x61,
-  0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20, 0x6d, 0x61,
-  0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6e, 0x63,
-  0x65, 0x20, 0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, 0x74, 0x68,
-  0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e,
-  0x63, 0x70, 0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x6a,
-  0x75, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x74,
-  0x68, 0x65, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x64,
-  0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74,
-  0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73,
-  0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a,
-  0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65,
-  0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63,
-  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
-  0x79, 0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c, 0x64, 0x20,
-  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
-  0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20,
-  0x73, 0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66,
-  0x72, 0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20,
-  0x77, 0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61,
-  0x6b, 0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73,
-  0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
-  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61,
-  0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
-  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69,
-  0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20,
-  0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74,
-  0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e,
-  0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20,
-  0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65,
-  0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75,
-  0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75,
-  0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x4a,
-  0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65,
-  0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x6d, 0x65, 0x73, 0x73,
-  0x61, 0x67, 0x65, 0x2e, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x73,
-  0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61,
-  0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x68, 0x72, 0x6f, 0x77,
-  0x20, 0x61, 0x6e, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x74, 0x6f,
-  0x20, 0x62, 0x65, 0x20, 0x63, 0x61, 0x75, 0x67, 0x68, 0x74, 0x20, 0x62,
-  0x79, 0x20, 0x75, 0x70, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x20, 0x63,
-  0x61, 0x6c, 0x6c, 0x65, 0x72, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74,
-  0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72,
-  0x6f, 0x72, 0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61,
-  0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65,
-  0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e,
-  0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x5b, 0x24,
-  0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f,
-  0x72, 0x2e, 0x63, 0x6f, 0x64, 0x65, 0x7d, 0x20, 0x2d, 0x20, 0x24, 0x7b,
-  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72,
-  0x2e, 0x74, 0x79, 0x70, 0x65, 0x7d, 0x5d, 0x3a, 0x20, 0x24, 0x7b, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e,
-  0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28,
-  0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
-  0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f,
-  0x72, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65,
-  0x72, 0x72, 0x6f, 0x72, 0x7d, 0x60, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e,
-  0x6e, 0x61, 0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62,
-  0x6f, 0x72, 0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
-  0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22,
-  0x2c, 0x20, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b,
-  0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c,
-  0x6c, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72,
-  0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61,
-  0x6c, 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e,
-  0x74, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61,
-  0x74, 0x20, 0x79, 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75,
-  0x62, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f,
-  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65,
-  0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69,
-  0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
-  0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
-  0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f,
-  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27,
-  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
-  0x67, 0x65, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a,
-  0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61,
-  0x64, 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65,
-  0x6e, 0x65, 0x72, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65,
-  0x22, 0x2c, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d,
-  0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
-  0x74, 0x65, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74,
-  0x61, 0x69, 0x6c, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
-  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f,
-  0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74,
-  0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72,
-  0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
-  0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69,
-  0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65,
-  0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e,
-  0x65, 0x77, 0x20, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
-  0x65, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79,
-  0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20,
-  0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f,
-  0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c,
-  0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68,
-  0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b,
-  0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
-  0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65,
-  0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64,
-  0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74,
-  0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45,
-  0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67,
-  0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
-  0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61,
-  0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
-  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67,
-  0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
-  0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
-  0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74,
-  0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20,
-  0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28,
-  0x22, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
-  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b,
-  0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75,
-  0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65,
-  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
-  0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
-  0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74,
-  0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65,
-  0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73,
-  0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e,
-  0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65,
-  0x6e, 0x74, 0x28, 0x22, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22,
-  0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20,
-  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74,
-  0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
-  0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74,
-  0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20,
-  0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28,
-  0x22, 0x64, 0x6f, 0x6e, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65,
-  0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x6e, 0x74, 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x7d, 0x29, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
-  0x67, 0x65, 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43,
-  0x61, 0x6c, 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d,
-  0x69, 0x73, 0x65, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73,
-  0x6f, 0x6c, 0x76, 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74,
-  0x65, 0x78, 0x74, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f,
-  0x65, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f,
-  0x72, 0x74, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67,
-  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70,
-  0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69,
-  0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74,
-  0x68, 0x65, 0x6e, 0x28, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
-  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74,
-  0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x6f, 0x72, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28,
-  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e,
-  0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
-  0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70,
-  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d,
-  0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66,
-  0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e,
-  0x65, 0x77, 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61,
-  0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
-  0x65, 0x2c, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d,
-  0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
-  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77,
-  0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
-  0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61,
-  0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67,
-  0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20,
-  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63,
-  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
-  0x73, 0x6f, 0x6c, 0x76, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
-  0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61,
-  0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65,
-  0x63, 0x74, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d,
-  0x3b, 0x0a, 0x0a, 0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64,
-  0x65, 0x70, 0x72, 0x65, 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20,
-  0x2a, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d,
-  0x70, 0x6c, 0x65, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e,
-  0x63, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63,
-  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63,
-  0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69,
-  0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75,
-  0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28,
-  0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70,
-  0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b,
-  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20,
-  0x7d, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61,
-  0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f,
-  0x20, 0x47, 0x65, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64,
-  0x65, 0x6c, 0x20, 0x69, 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d,
-  0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e,
-  0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65,
-  0x66, 0x75, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74,
-  0x69, 0x6e, 0x67, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x78, 0x74, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61,
-  0x6e, 0x64, 0x20, 0x73, 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78,
-  0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66,
-  0x6f, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x63,
-  0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20,
-  0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21,
-  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
-  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x61, 0x70, 0x69,
-  0x5f, 0x75, 0x72, 0x6c, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69,
-  0x67, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x20, 0x7c, 0x7c,
-  0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x20, 0x3d, 0x20, 0x61,
-  0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x60,
-  0x24, 0x7b, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x7d, 0x2f, 0x70,
-  0x72, 0x6f, 0x70, 0x73, 0x60, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28,
-  0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28,
-  0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65,
-  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
-  0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e,
-  0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x67, 0x65, 0x6e, 0x65,
-  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
-  0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
-  0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
-  0x73, 0x3b, 0x0a, 0x7d, 0x0a
-};
-unsigned int completion_js_len = 5909;
--- a/examples/server/deps.sh
+++ b/examples/server/deps.sh
@@ -8,13 +8,3 @@ PUBLIC=$DIR/public
 echo "download js bundle files"
 curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
 echo >> $PUBLIC/index.js # add newline
-
-FILES=$(ls $PUBLIC)
-
-cd $PUBLIC
-for FILE in $FILES; do
-  echo "generate $FILE.hpp"
-
-  # use simple flag for old version of xxd
-  xxd -i $FILE > $DIR/$FILE.hpp
-done
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/index.js.hpp
+++ b/examples/server/index.js.hpp
--- a/examples/server/json-schema-to-grammar.mjs.hpp
+++ b/examples/server/json-schema-to-grammar.mjs.hpp
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -406,7 +406,7 @@
        throw new Error("already running");
      }
      controller.value = new AbortController();
-      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: document.baseURI.replace(/\/+$/, '') })) {
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value, api_url: location.pathname.replace(/\/+$/, '') })) {
        const data = chunk.data;

        if (data.stop) {
@@ -881,11 +881,11 @@
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
-        .replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
-        .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
-        .replace(/__(.*?)__/g, '<strong>$1</strong>')
-        .replace(/\*(.*?)\*/g, '<em>$1</em>')
-        .replace(/_(.*?)_/g, '<em>$1</em>')
+        .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
+        .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
+        .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
+        .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
+        .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
        .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
        .replace(/`(.*?)`/g, '<code>$1</code>')
        .replace(/\n/gim, '<br />');
@@ -1015,6 +1015,10 @@
    }

    function App(props) {
+      useEffect(() => {
+        const query = new URLSearchParams(location.search).get("q");
+        if (query) chat(query);
+      }, []);

      return html`
        <div class="mode-${session.value.type}">
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -1,33 +1,95 @@
-// WARNING: This file was ported from json-schema-to-grammar.py, please fix bugs / add features there first.
+// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
 const SPACE_RULE = '" "?';

+function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
+  const separatorRule = opts.separatorRule ?? '';
+  const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false
+
+  if (separatorRule === '') {
+    if (minItems === 0 && maxItems === 1) {
+      return `${itemRule}?`;
+    } else if (minItems === 1 && maxItems === undefined) {
+      return `${itemRule}+`;
+    }
+  }
+
+  let result = '';
+  if (minItems > 0) {
+    if (itemRuleIsLiteral && separatorRule === '') {
+      result = `"${itemRule.slice(1, -1).repeat(minItems)}"`;
+    } else {
+      result = Array.from({ length: minItems }, () => itemRule)
+        .join(separatorRule !== '' ? ` ${separatorRule} ` : ' ');
+    }
+  }
+
+  const optRepetitions = (upToN, prefixWithSep=false) => {
+    const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule;
+    if (upToN === 0) {
+      return '';
+    } else if (upToN === 1) {
+      return `(${content})?`;
+    } else if (separatorRule !== '' && !prefixWithSep) {
+      return `(${content} ${optRepetitions(upToN - 1, true)})?`;
+    } else {
+      return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join('');
+    }
+  };
+
+  if (minItems > 0 && maxItems !== minItems) {
+    result += ' ';
+  }
+
+  if (maxItems !== undefined) {
+    result += optRepetitions(maxItems - minItems, minItems > 0);
+  } else {
+    const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`;
+
+    if (minItems === 0 && separatorRule !== '') {
+      result = `(${itemRule} ${itemOperator}*)?`;
+    } else {
+      result += `${itemOperator}*`;
+    }
+  }
+
+  return result;
+}
+
+class BuiltinRule {
+  constructor(content, deps) {
+    this.content = content;
+    this.deps = deps || [];
+  }
+}
+
+const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15);
+
 const PRIMITIVE_RULES = {
-  boolean: '("true" | "false") space',
-  number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
-  integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
-  value: 'object | array | string | number | boolean',
-  object: '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
-  array: '"[" space ( value ("," space value)* )? "]" space',
-  uuid: '"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space',
-  string: ` "\\"" (
-        [^"\\\\] |
-        "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
-      )* "\\"" space`,
-  null: '"null" space',
+  boolean        : new BuiltinRule('("true" | "false") space', []),
+  'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []),
+  'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []),
+  number         : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
+  integer        : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
+  value          : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
+  object         : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
+  array          : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
+  uuid           : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []),
+  char           : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []),
+  string         : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
+  null           : new BuiltinRule('"null" space', []),
 };
-const OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value'];

 // TODO: support "uri", "email" string formats
-const DATE_RULES = {
-    'date'   : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
-    'time'   : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
-    'date-time': 'date "T" time',
-    'date-string': '"\\"" date "\\"" space',
-    'time-string': '"\\"" time "\\"" space',
-    'date-time-string': '"\\"" date-time "\\"" space',
-};
+const STRING_FORMAT_RULES = {
+  'date'            : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
+  'time'            : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
+  'date-time'       : new BuiltinRule('date "T" time', ['date', 'time']),
+  'date-string'     : new BuiltinRule('"\\"" date "\\"" space', ['date']),
+  'time-string'     : new BuiltinRule('"\\"" time "\\"" space', ['time']),
+  'date-time-string': new BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
+}

-const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...DATE_RULES};
+const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...STRING_FORMAT_RULES};

 const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
 const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
@@ -158,7 +220,7 @@ export class SchemaConverter {
        rule = '[\\U00000000-\\U0010FFFF]';
      } else {
        // Accept any character... except \n and \r line break chars (\x0A and \xOD)
-        rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]';
+        rule = '[^\\x0A\\x0D]';
      }
      return this._addRule('dot', rule);
    };
@@ -259,26 +321,19 @@ export class SchemaConverter {

          let [sub, subIsLiteral] = seq[seq.length - 1];

-          if (minTimes === 0 && maxTimes === Infinity) {
-            seq[seq.length - 1] = [`${sub}*`, false];
-          } else if (minTimes === 0 && maxTimes === 1) {
-            seq[seq.length - 1] = [`${sub}?`, false];
-          } else if (minTimes === 1 && maxTimes === Infinity) {
-            seq[seq.length - 1] = [`${sub}+`, false];
-          } else {
-            if (!subIsLiteral) {
-              let id = subRuleIds[sub];
-              if (id === undefined) {
-                id = this._addRule(`${name}-${Object.keys(subRuleIds).length + 1}`, sub);
-                subRuleIds[sub] = id;
-              }
-              sub = id;
+          if (!subIsLiteral) {
+            let id = subRuleIds[sub];
+            if (id === undefined) {
+              id = this._addRule(`${name}-${Object.keys(subRuleIds).length + 1}`, sub);
+              subRuleIds[sub] = id;
            }
-
-            const repeatedSub = Array.from({ length: minTimes }, () => subIsLiteral ? `"${sub.slice(1, -1).repeat(minTimes)}"` : sub);
-            const optionalSub = maxTimes !== undefined ? Array.from({ length: maxTimes - minTimes }, () => `${sub}?`) : [`${sub}*`];
-            seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false];
+            sub = id;
          }
+
+          seq[seq.length - 1] = [
+            _buildRepetition(subIsLiteral ? `"${sub}"` : sub, minTimes, maxTimes, {itemRuleIsLiteral: subIsLiteral}),
+            false
+          ];
        } else {
          let literal = '';
          while (i < length) {
@@ -394,49 +449,50 @@ export class SchemaConverter {
        );
      } else {
        const itemRuleName = this.visit(items, `${name ?? ''}${name ? '-' : ''}item`);
-        const listItemOperator = `( "," space ${itemRuleName} )`;
-        let successiveItems = '';
-        let minItems = schema.minItems || 0;
+        const minItems = schema.minItems || 0;
        const maxItems = schema.maxItems;
-        if (minItems > 0) {
-          successiveItems = listItemOperator.repeat(minItems - 1);
-          minItems--;
-        }
-        if (maxItems !== undefined && maxItems > minItems) {
-          successiveItems += `${listItemOperator}?`.repeat(maxItems - minItems - 1);
-        } else {
-          successiveItems += `${listItemOperator}*`;
-        }
-        const rule = minItems === 0
-          ? `"[" space ( ${itemRuleName} ${successiveItems} )? "]" space`
-          : `"[" space ${itemRuleName} ${successiveItems} "]" space`;
-        return this._addRule(ruleName, rule);
+        return this._addRule(ruleName, '"[" space ' + _buildRepetition(itemRuleName, minItems, maxItems, {separatorRule: '"," space'}) + ' "]" space');
      }
    } else if ((schemaType === undefined || schemaType === 'string') && 'pattern' in schema) {
      return this._visitPattern(schema.pattern, ruleName);
    } else if ((schemaType === undefined || schemaType === 'string') && /^uuid[1-5]?$/.test(schema.format || '')) {
-      return this._addRule(
-          ruleName === 'root' ? 'root' : schemaFormat,
-          PRIMITIVE_RULES['uuid'])
-    } else if ((schemaType === undefined || schemaType === 'string') && schema.format in DATE_RULES) {
-      for (const [t, r] of Object.entries(DATE_RULES)) {
-        this._addRule(t, r);
-      }
-      return schemaFormat + '-string';
+      return this._addPrimitive(
+        ruleName === 'root' ? 'root' : schemaFormat,
+        PRIMITIVE_RULES['uuid']
+      );
+    } else if ((schemaType === undefined || schemaType === 'string') && `${schema.format}-string` in STRING_FORMAT_RULES) {
+      const primName = `${schema.format}-string`
+      return this._addRule(ruleName, this._addPrimitive(primName, STRING_FORMAT_RULES[primName]));
+    } else if (schemaType === 'string' && ('minLength' in schema || 'maxLength' in schema)) {
+      const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
+      const minLen = schema.minLength || 0;
+      const maxLen = schema.maxLength;
+      return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
    } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
-      for (const n of OBJECT_RULE_NAMES) {
-        this._addRule(n, PRIMITIVE_RULES[n]);
-      }
-      return this._addRule(ruleName, 'object');
+      return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
    } else {
      if (!(schemaType in PRIMITIVE_RULES)) {
        throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
      }
      // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
-      return this._addRule(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
+      return this._addPrimitive(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
    }
  }

+  /**
+   * Registers a primitive rule and, recursively, every rule listed in its `deps`.
+   *
+   * @param name rule name handed to `_addRule`
+   * @param rule object with a `content` string and an iterable `deps` of rule names
+   * @returns whatever `_addRule` returned for `name`
+   * @throws Error when a dependency is in neither PRIMITIVE_RULES nor STRING_FORMAT_RULES
+   */
+  _addPrimitive(name, rule) {
+    // The rule itself is added first, so it is already in place when its deps are walked.
+    const ruleKey = this._addRule(name, rule.content);
+    for (const depName of rule.deps) {
+      const depRule = PRIMITIVE_RULES[depName] || STRING_FORMAT_RULES[depName];
+      if (!depRule) {
+        throw new Error(`Rule ${depName} not known`);
+      }
+      if (depName in this._rules) {
+        // already registered, nothing to add
+        continue;
+      }
+      this._addPrimitive(depName, depRule);
+    }
+    return ruleKey;
+  }
+
  _buildObjectRule(properties, required, name, additionalProperties) {
    const propOrder = this._propOrder;
    // sort by position in prop_order (if specified) then by original order
@@ -462,7 +518,7 @@ export class SchemaConverter {
      const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
      propKvRuleNames['*'] = this._addRule(
        `${subName}-kv`,
-        `${this._addRule('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
+        `${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
      optionalProps.push('*');
    }

--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -61,7 +61,10 @@ enum server_task_type {
    SERVER_TASK_TYPE_COMPLETION,
    SERVER_TASK_TYPE_CANCEL,
    SERVER_TASK_TYPE_NEXT_RESPONSE,
-    SERVER_TASK_TYPE_METRICS
+    SERVER_TASK_TYPE_METRICS,
+    SERVER_TASK_TYPE_SLOT_SAVE,
+    SERVER_TASK_TYPE_SLOT_RESTORE,
+    SERVER_TASK_TYPE_SLOT_ERASE,
 };

 struct server_task {
@@ -128,6 +131,7 @@ struct server_params {

    bool slots_endpoint   = true;
    bool metrics_endpoint = false;
+    std::string slot_save_path;
 };

 struct server_slot {
@@ -685,6 +689,7 @@ struct server_context {
        n_ctx = llama_n_ctx(ctx);

        add_bos_token = llama_should_add_bos_token(model);
+        GGML_ASSERT(llama_add_eos_token(model) != 1);

        return true;
    }
@@ -754,7 +759,7 @@ struct server_context {
        metrics.init();
    }

-    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const {
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
        // TODO: currently, we tokenize using special tokens by default
        //       this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
        //       but it's better compared to completely ignoring ChatML and other chat templates
@@ -772,7 +777,7 @@ struct server_context {

                    std::vector<llama_token> p;
                    if (first) {
-                        p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+                        p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
                        first = false;
                    } else {
                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
@@ -789,7 +794,7 @@ struct server_context {
            }
        } else {
            auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
        }

        return prompt_tokens;
@@ -849,12 +854,12 @@ struct server_context {
        slot.sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
        slot.params.n_keep             = json_value(data, "n_keep",            slot.params.n_keep);
        slot.params.n_discard          = json_value(data, "n_discard",         default_params.n_discard);
-        slot.params.seed               = json_value(data, "seed",              default_params.seed);
+        slot.sparams.seed              = json_value(data, "seed",              default_sparams.seed);
        slot.sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
        slot.sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);

        // process "json_schema" and "grammar"
-        if (data.contains("json_schema") && data.contains("grammar")) {
+        if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
            send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
            return false;
        } else if (data.contains("json_schema") && !data.contains("grammar")) {
@@ -1023,7 +1028,6 @@ struct server_context {
                send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
                return false;
            }
-            llama_set_rng_seed(ctx, slot.params.seed);
        }

        slot.command = SLOT_COMMAND_LOAD_PROMPT;
@@ -1054,7 +1058,7 @@ struct server_context {
        system_tokens.clear();

        if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
+            system_tokens = ::llama_tokenize(ctx, system_prompt, true);

            llama_batch_clear(batch);

@@ -1078,7 +1082,7 @@ struct server_context {
                };

                if (llama_decode(ctx, batch_view) != 0) {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    LOG_ERROR("llama_decode() failed", {});
                    return;
                }
            }
@@ -1113,7 +1117,7 @@ struct server_context {

    bool process_token(completion_token_output & result, server_slot & slot) {
        // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
        slot.sampled = result.tok;

        // search stop word and delete it
@@ -1196,13 +1200,34 @@ struct server_context {
            });
        }

-        if (result.tok == llama_token_eos(model)) {
+        if (llama_token_is_eog(model, result.tok)) {
            slot.stopped_eos    = true;
            slot.has_next_token = false;

            LOG_VERBOSE("eos token found", {});
        }

+        auto n_ctx_train = llama_n_ctx_train(model);
+        if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1
+                    && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
+            LOG_WARNING("n_predict is not set and self-context extend is disabled."
+                        " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
+                    { "id_slot",              slot.id },
+                    { "params.n_predict",     slot.params.n_predict },
+                    { "slot.n_prompt_tokens", slot.n_prompt_tokens },
+                    { "slot.n_decoded",       slot.n_decoded },
+                    { "slot.n_predict",       slot.n_predict },
+                    { "n_slots",              params.n_parallel },
+                    { "slot.n_ctx",           slot.n_ctx },
+                    { "n_ctx",                n_ctx },
+                    { "n_ctx_train",          n_ctx_train },
+                    { "ga_n",                 slot.ga_n },
+                });
+            slot.truncated      = true;
+            slot.stopped_limit  = true;
+            slot.has_next_token = false; // stop prediction
+        }
+
        LOG_VERBOSE("next token", {
            {"id_slot",        slot.id},
            {"id_task",        slot.id_task},
@@ -1276,7 +1301,11 @@ struct server_context {
    }

    void send_error(const int id_task, const int id_multi, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
-        LOG_TEE("task %i - error: %s\n", id_task, error.c_str());
+        LOG_ERROR("task error", {
+            {"id_multi", id_multi},
+            {"id_task", id_task},
+            {"error", error},
+        });

        server_task_result res;
        res.id       = id_task;
@@ -1612,6 +1641,107 @@ struct server_context {
                    }
                    queue_results.send(res);
                } break;
+            case SERVER_TASK_TYPE_SLOT_SAVE:
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    const size_t token_count = slot->cache_tokens.size();
+                    const int64_t t_start = ggml_time_us();
+
+                    std::string filename = task.data["filename"];
+                    std::string filepath = task.data["filepath"];
+
+                    const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
+
+                    const int64_t t_end = ggml_time_us();
+                    const double t_save_ms = (t_end - t_start) / 1000.0;
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot",   id_slot },
+                        { "filename",  filename },
+                        { "n_saved",   token_count }, // tokens saved
+                        { "n_written", nwrite },      // bytes written
+                        { "timings", {
+                            { "save_ms", t_save_ms }
+                        } }
+                    };
+                    queue_results.send(result);
+                } break;
+            case SERVER_TASK_TYPE_SLOT_RESTORE:
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    const int64_t t_start = ggml_time_us();
+
+                    std::string filename = task.data["filename"];
+                    std::string filepath = task.data["filepath"];
+
+                    slot->cache_tokens.resize(slot->n_ctx);
+                    size_t token_count = 0;
+                    size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
+                    if (nread == 0) {
+                        slot->cache_tokens.resize(0);
+                        send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+                    slot->cache_tokens.resize(token_count);
+
+                    const int64_t t_end = ggml_time_us();
+                    const double t_restore_ms = (t_end - t_start) / 1000.0;
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot",    id_slot },
+                        { "filename",   filename },
+                        { "n_restored", token_count }, // tokens restored
+                        { "n_read",     nread },       // bytes read
+                        { "timings", {
+                            { "restore_ms", t_restore_ms }
+                        } }
+                    };
+                    queue_results.send(result);
+                } break;
+            case SERVER_TASK_TYPE_SLOT_ERASE:
+                {
+                    int id_slot = task.data["id_slot"];
+                    server_slot * slot = get_slot(id_slot);
+                    if (slot == nullptr) {
+                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
+                        break;
+                    }
+
+                    // Erase token cache
+                    const size_t n_erased = slot->cache_tokens.size();
+                    llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
+                    slot->cache_tokens.clear();
+
+                    server_task_result result;
+                    result.id = task.id;
+                    result.stop = true;
+                    result.error = false;
+                    result.data = json {
+                        { "id_slot",  id_slot },
+                        { "n_erased", n_erased }
+                    };
+                    queue_results.send(result);
+                } break;
        }
    }

@@ -1809,7 +1939,7 @@ struct server_context {
                            prefix_tokens.push_back(llama_token_middle(model));
                            prompt_tokens = prefix_tokens;
                        } else {
-                            prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token);  // add BOS if there isn't system prompt
+                            prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
                        }

                        slot.n_past = 0;
@@ -2032,7 +2162,7 @@ struct server_context {
        });

        // process the created batch of tokens
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+        for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);

            for (auto & slot : slots) {
@@ -2080,7 +2210,11 @@ struct server_context {
            if (ret != 0) {
                if (n_batch == 1 || ret < 0) {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", {
+                        {"i",   i},
+                        {"n_batch",  n_batch}, // batch size in use when decoding failed (was mistakenly logging ret)
+                        {"ret",   ret},
+                    });
                    for (auto & slot : slots) {
                        slot.state = SLOT_STATE_PROCESSING;
                        slot.command = SLOT_COMMAND_NONE;
@@ -2090,12 +2224,16 @@ struct server_context {
                    break; // break loop of n_batch
                }

-                LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
-
                // retry with half the batch size to try to find a free slot in the KV cache
                n_batch /= 2;
                i -= n_batch;

+                LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", {
+                    {"i",   i},
+                    {"n_batch",  n_batch},
+                    {"ret",   ret},
+                });
+
                continue; // continue loop of n_batch
            }

@@ -2215,7 +2353,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
        printf("                            disable KV offload\n");
    }
    printf("  -m FNAME, --model FNAME\n");
-    printf("                            model path (default: %s)\n", params.model.c_str());
+    printf("                            model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
    printf("                            model download url (default: unused)\n");
    printf("  -hfr REPO, --hf-repo REPO\n");
@@ -2239,6 +2377,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
    printf("  --embeddings              enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
    printf("  -np N, --parallel N       number of slots for process requests (default: %d)\n", params.n_parallel);
    printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
+    printf("  -fa, --flash-attn         enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
    printf("  -spf FNAME, --system-prompt-file FNAME\n");
    printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
    printf("  -ctk TYPE, --cache-type-k TYPE\n");
@@ -2249,11 +2388,12 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
    printf("  --log-disable             disables logging to a file.\n");
    printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
    printf("  --metrics                 enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled");
+    printf("  --slot-save-path PATH     path to save slot kv cache (default: disabled)\n");
    printf("\n");
    printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+    printf("                            types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
    printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
    printf("  --chat-template JINJA_TEMPLATE\n");
@@ -2603,6 +2743,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
            params.embedding = true;
        } else if (arg == "-cb" || arg == "--cont-batching") {
            params.cont_batching = true;
+        } else if (arg == "-fa" || arg == "--flash-attn") {
+            params.flash_attn = true;
        } else if (arg == "-np" || arg == "--parallel") {
            if (++i >= argc) {
                invalid_param = true;
@@ -2657,6 +2799,16 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
            sparams.slots_endpoint = false;
        } else if (arg == "--metrics") {
            sparams.metrics_endpoint = true;
+        } else if (arg == "--slot-save-path") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            sparams.slot_save_path = argv[i];
+            // if doesn't end with DIRECTORY_SEPARATOR, add it
+            if (!sparams.slot_save_path.empty() && sparams.slot_save_path[sparams.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
+                sparams.slot_save_path += DIRECTORY_SEPARATOR;
+            }
        } else if (arg == "--chat-template") {
            if (++i >= argc) {
                invalid_param = true;
@@ -2674,43 +2826,11 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                invalid_param = true;
                break;
            }
-            char * sep = strchr(argv[i], '=');
-            if (sep == nullptr || sep - argv[i] >= 128) {
-                fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-
-            struct llama_model_kv_override kvo;
-            std::strncpy(kvo.key, argv[i], sep - argv[i]);
-            kvo.key[sep - argv[i]] = 0;
-            sep++;
-            if (strncmp(sep, "int:", 4) == 0) {
-                sep += 4;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-                kvo.int_value = std::atol(sep);
-            } else if (strncmp(sep, "float:", 6) == 0) {
-                sep += 6;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-                kvo.float_value = std::atof(sep);
-            } else if (strncmp(sep, "bool:", 5) == 0) {
-                sep += 5;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-                if (std::strcmp(sep, "true") == 0) {
-                    kvo.bool_value = true;
-                } else if (std::strcmp(sep, "false") == 0) {
-                    kvo.bool_value = false;
-                } else {
-                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                    invalid_param = true;
-                    break;
-                }
-            } else {
+            if (!parse_kv_override(argv[i], params.kv_overrides)) {
                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
                invalid_param = true;
                break;
            }
-            params.kv_overrides.push_back(kvo);
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            server_print_usage(argv[0], default_params, default_sparams);
@@ -2718,6 +2838,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
        }
    }

+    gpt_params_handle_model_default(params);
+
    if (!params.kv_overrides.empty()) {
        params.kv_overrides.emplace_back();
        params.kv_overrides.back().key[0] = 0;
@@ -3159,6 +3281,112 @@ int main(int argc, char ** argv) {
        res.status = 200; // HTTP OK
    };

+    // Handles the "save" action for a slot: takes the target file name from the JSON request body,
+    // posts a SERVER_TASK_TYPE_SLOT_SAVE task and replies with that task's result.
+    //   req     - request whose body is expected to be a JSON object with a string "filename"
+    //   res     - response that receives either the task result (as JSON) or an error
+    //   id_slot - slot index, forwarded to the task unchanged
+    const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+        std::string filename;
+        try {
+            // an unparsable body, a missing "filename" or a non-string value all throw here;
+            // report them as an invalid request instead of letting the exception leave the handler
+            const json request_data = json::parse(req.body);
+            filename = request_data.at("filename").get<std::string>();
+        } catch (const std::exception &) {
+            res_error(res, format_error_response("Invalid request body", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        if (!validate_file_name(filename)) {
+            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        // the file is addressed relative to the configured slot save path
+        std::string filepath = sparams.slot_save_path + filename;
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SLOT_SAVE;
+        task.data = {
+            { "id_slot", id_slot },
+            { "filename", filename },
+            { "filepath", filepath }
+        };
+
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        if (result.error) {
+            res_error(res, result.data);
+        } else {
+            res.set_content(result.data.dump(), "application/json");
+        }
+    };
+
+    // Handles the "restore" action for a slot: takes the source file name from the JSON request body,
+    // posts a SERVER_TASK_TYPE_SLOT_RESTORE task and replies with that task's result.
+    //   req     - request whose body is expected to be a JSON object with a string "filename"
+    //   res     - response that receives either the task result (as JSON) or an error
+    //   id_slot - slot index, forwarded to the task unchanged
+    const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
+        std::string filename;
+        try {
+            // an unparsable body, a missing "filename" or a non-string value all throw here;
+            // report them as an invalid request instead of letting the exception leave the handler
+            const json request_data = json::parse(req.body);
+            filename = request_data.at("filename").get<std::string>();
+        } catch (const std::exception &) {
+            res_error(res, format_error_response("Invalid request body", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        if (!validate_file_name(filename)) {
+            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+        // the file is addressed relative to the configured slot save path
+        std::string filepath = sparams.slot_save_path + filename;
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SLOT_RESTORE;
+        task.data = {
+            { "id_slot", id_slot },
+            { "filename", filename },
+            { "filepath", filepath }
+        };
+
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        if (result.error) {
+            res_error(res, result.data);
+        } else {
+            res.set_content(result.data.dump(), "application/json");
+        }
+    };
+
+    // Handles the "erase" action for a slot: posts a SERVER_TASK_TYPE_SLOT_ERASE task carrying only
+    // the slot index and replies with that task's result. The request itself is not inspected.
+    const auto handle_slots_erase = [&ctx_server, &res_error](const httplib::Request & /* req */, httplib::Response & res, int id_slot) {
+        server_task erase_task;
+        erase_task.type = SERVER_TASK_TYPE_SLOT_ERASE;
+        erase_task.data = {
+            { "id_slot", id_slot }
+        };
+
+        // register interest in the task id, then wait for its result
+        const int task_id = ctx_server.queue_tasks.post(erase_task);
+        ctx_server.queue_results.add_waiting_task_id(task_id);
+
+        server_task_result outcome = ctx_server.queue_results.recv(task_id);
+        ctx_server.queue_results.remove_waiting_task_id(task_id);
+
+        if (!outcome.error) {
+            res.set_content(outcome.data.dump(), "application/json");
+            return;
+        }
+        res_error(res, outcome.data);
+    };
+
+    // Entry point for slot actions: parses the "id_slot" path parameter as an integer and dispatches
+    // on the "action" query parameter ("save", "restore" or "erase"). Anything else is answered with
+    // an invalid-request error.
+    const auto handle_slots_action = [&res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+
+        const std::string raw_slot_id = req.path_params.at("id_slot");
+
+        int slot_index;
+        try {
+            slot_index = std::stoi(raw_slot_id);
+        } catch (const std::exception &) {
+            // not a number (or out of range for int)
+            res_error(res, format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST));
+            return;
+        }
+
+        const std::string requested = req.get_param_value("action");
+
+        if (requested == "save") {
+            handle_slots_save(req, res, slot_index);
+            return;
+        }
+        if (requested == "restore") {
+            handle_slots_restore(req, res, slot_index);
+            return;
+        }
+        if (requested == "erase") {
+            handle_slots_erase(req, res, slot_index);
+            return;
+        }
+
+        res_error(res, format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
+    };
+
    const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
        json data = {
@@ -3521,6 +3749,10 @@ int main(int argc, char ** argv) {
    svr->Post("/v1/embeddings",       handle_embeddings);
    svr->Post("/tokenize",            handle_tokenize);
    svr->Post("/detokenize",          handle_detokenize);
+    if (!sparams.slot_save_path.empty()) {
+        // only enable slot endpoints if slot_save_path is set
+        svr->Post("/slots/:id_slot",  handle_slots_action);
+    }

    //
    // Start the server
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -29,7 +29,7 @@ To mitigate it, you can increase values in `n_predict`, `kv_size`.
 cd ../../..
 mkdir build
 cd build
-cmake ../
+cmake -DLLAMA_CURL=ON ../
 cmake --build . --target server
 ```

--- a/Show More
+++ b/Show More