quantize: add imatrix and dataset metadata in GGUF (#6658 )

* imatrix: save the dataset file used in the output file * llama: support kv overrides type string string * common: factorize KV Overrides parsing between common and server * quantize: add imatrix n entries and dataset KV metadata quantize: factorize KV Overrides parsing between common #6656 * llama: remove kv override str_value initialization as it does not compile on some toolchain * quantize: add imatrix m_last_call as `quantize.imatrix.chunks_count` * quantize: add imatrix filename in KV * llama: add llama_model_kv_override_free * common: add llama_model_kv_override_free common: free kv override if used after model loading * llama: finally move the string KV override value to the stack * llama : minor * no need to add a NUL to the std::vector, std::string can be initialized from a pair of iterators. Co-authored-by: slaren <slarengh@gmail.com> * kv override: ensure string termination --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: slaren <slarengh@gmail.com>
add basic tensor data validation function (#6884 )
2026-02-05 13:53:23 +02:00 · 2024-04-26 20:06:33 +02:00 · 2024-04-26 18:39:58 +02:00 · 2024-04-26 18:07:42 +03:00 · 2024-04-26 17:05:33 +03:00 · 2024-04-26 12:27:25 +02:00
120 changed files with 8566 additions and 10505 deletions
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -32,7 +32,7 @@ on:
    -  cron: '04 2 * * *'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

 jobs:
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -32,6 +32,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - name: Dependencies
        id: depends
@@ -52,7 +54,7 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L 'main|curl' --verbose --timeout 900

      - name: Determine tag name
        id: tag
@@ -88,6 +90,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - name: Dependencies
        id: depends
@@ -101,7 +105,9 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
+          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -204,26 +210,28 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
          cmake --build . --config Release -j $(nproc)

      - name: Test
        id: cmake_test
        run: |
          cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L 'main|curl' --verbose --timeout 900

      - name: Test llama2c conversion
        id: llama2c_test
@@ -236,6 +244,33 @@ jobs:
          ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
          ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256

+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp LICENSE ./build/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
+          name: llama-bin-ubuntu-x64.zip
+
 #  ubuntu-latest-cmake-sanitizer:
 #    runs-on: ubuntu-latest
 #
@@ -938,6 +973,12 @@ jobs:
      - name: Download artifacts
        id: download-artifact
        uses: actions/download-artifact@v4
+        with:
+          path: ./artifact
+
+      - name: Move artifacts
+        id: move_artifacts
+        run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release

      - name: Create release
        id: create_release
@@ -956,7 +997,7 @@ jobs:
            const path = require('path');
            const fs = require('fs');
            const release_id = '${{ steps.create_release.outputs.id }}';
-            for (let file of await fs.readdirSync('./artifact')) {
+            for (let file of await fs.readdirSync('./artifact/release')) {
              if (path.extname(file) === '.zip') {
                console.log('uploadReleaseAsset', file);
                await github.repos.uploadReleaseAsset({
@@ -964,7 +1005,7 @@ jobs:
                  repo: context.repo.repo,
                  release_id: release_id,
                  name: file,
-                  data: await fs.readFileSync(`./artifact/${file}`)
+                  data: await fs.readFileSync(`./artifact/release/${file}`)
                });
              }
            }
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -91,6 +91,12 @@ jobs:
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi

+      - name: Downcase github.repository_owner
+        run: |
+          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
+        env:
+          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
@@ -98,7 +104,7 @@ jobs:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
@@ -107,5 +113,5 @@ jobs:
          context: .
          push: ${{ github.event_name == 'push' }}
          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
          file: ${{ matrix.config.dockerfile }}
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -23,7 +23,7 @@ on:
    -  cron: '2 4 * * *'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

 jobs:
@@ -58,6 +58,7 @@ jobs:
            git \
            cmake \
            python3-pip \
+            python3-venv \
            curl \
            wget \
            language-pack-en \
@@ -100,10 +101,13 @@ jobs:
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server

-      - name: Tests dependencies
-        id: test_dependencies
+      - name: Setup python env
+        id: pipenv
        run: |
-          pip install -r examples/server/tests/requirements.txt
+          cd examples/server/tests
+          python3 -m venv venv
+          . venv/bin/activate
+          pip install -r requirements.txt

      - name: Tests
        id: server_integration_tests
--- a/.gitignore
+++ b/.gitignore
@@ -34,6 +34,7 @@ lcov-report/
 gcovr-report/

 build*
+!build.zig
 cmake-build-*
 out/
 tmp/
@@ -48,6 +49,7 @@ models-mnt
 /convert-llama2c-to-ggml
 /embd-input-test
 /embedding
+/eval-callback
 /gguf
 /gguf-llama-simple
 /gguf-split
@@ -99,6 +101,9 @@ qnt-*.txt
 perf-*.txt

 examples/jeopardy/results.txt
+examples/server/*.html.hpp
+examples/server/*.js.hpp
+examples/server/*.mjs.hpp

 poetry.lock
 poetry.toml
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,8 @@ else()
    set(LLAMA_METAL_DEFAULT OFF)
 endif()

+set(LLAMA_LLAMAFILE_DEFAULT ON)
+
 # general
 option(BUILD_SHARED_LIBS                "build shared libraries"                                OFF)
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
@@ -88,6 +90,7 @@ endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
 option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
+option(LLAMA_LLAMAFILE                       "llama: use llamafile SGEMM"                       ${LLAMA_LLAMAFILE_DEFAULT})
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUDA                            "llama: use CUDA"                                  OFF)
 option(LLAMA_CUBLAS                          "llama: use CUDA (deprecated, use LLAMA_CUDA)"     OFF)
@@ -286,6 +289,7 @@ if (LLAMA_METAL)
        ${METALKIT_FRAMEWORK}
        )
 endif()
+
 if (LLAMA_BLAS)
    if (LLAMA_STATIC)
        set(BLA_STATIC ON)
@@ -368,6 +372,13 @@ if (LLAMA_BLAS)
    endif()
 endif()

+if (LLAMA_LLAMAFILE)
+    add_compile_definitions(GGML_USE_LLAMAFILE)
+
+    set(GGML_HEADERS_LLAMAFILE sgemm.h)
+    set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
+endif()
+
 if (LLAMA_QKK_64)
    add_compile_definitions(GGML_QKK_64)
 endif()
@@ -1151,15 +1162,16 @@ add_library(ggml OBJECT
            ggml-backend.h
            ggml-quants.c
            ggml-quants.h
-            ${GGML_SOURCES_CUDA}    ${GGML_HEADERS_CUDA}
-            ${GGML_SOURCES_OPENCL}  ${GGML_HEADERS_OPENCL}
-            ${GGML_SOURCES_METAL}   ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI}     ${GGML_HEADERS_MPI}
-            ${GGML_SOURCES_EXTRA}   ${GGML_HEADERS_EXTRA}
-            ${GGML_SOURCES_SYCL}    ${GGML_HEADERS_SYCL}
-            ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
-            ${GGML_SOURCES_VULKAN}  ${GGML_HEADERS_VULKAN}
-            ${GGML_SOURCES_ROCM}    ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_OPENCL}    ${GGML_HEADERS_OPENCL}
+            ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_MPI}       ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_EXTRA}     ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_SYCL}      ${GGML_HEADERS_SYCL}
+            ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
+            ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
+            ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
            )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
--- a/39
+++ b/39
@@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search  \
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search  \
 	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

 # Binaries only useful for tests
@@ -384,6 +384,11 @@ ifdef LLAMA_OPENBLAS
 	MK_LDFLAGS  += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS

+ifndef LLAMA_NO_LLAMAFILE
+	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
+	OBJS        += sgemm.o
+endif
+
 ifdef LLAMA_BLIS
 	MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
 	MK_LDFLAGS  += -lblis -L/usr/local/lib
@@ -480,11 +485,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC_COMPILE)
-
 endif # LLAMA_CUDA

 ifdef LLAMA_CLBLAST
-
 	MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
 	MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -603,6 +606,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI

+ifndef LLAMA_NO_LLAMAFILE
+sgemm.o: sgemm.cpp sgemm.h ggml.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
 GF_CC := $(CC)
 include scripts/get-flags.mk

@@ -646,7 +654,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
-$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@@ -687,8 +695,8 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o

 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -756,11 +764,11 @@ batched: examples/batched/batched.cpp                         ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-batched-bench: examples/batched-bench/batched-bench.cpp       build-info.o ggml.o llama.o common.o $(OBJS)
+batched-bench: examples/batched-bench/batched-bench.cpp       build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-quantize: examples/quantize/quantize.cpp                      build-info.o ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp                      ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -788,10 +796,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

+# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+examples/server/%.hpp: examples/server/public/% Makefile
+	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+		echo "unsigned char $${NAME}[] = {" && \
+		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+		echo "};" && \
+		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+	) > $@
+
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -800,6 +817,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
--- a/Package.swift
+++ b/Package.swift
@@ -2,6 +2,45 @@

 import PackageDescription

+var sources = [
+    "ggml.c",
+    "sgemm.cpp",
+    "llama.cpp",
+    "unicode.cpp",
+    "unicode-data.cpp",
+    "ggml-alloc.c",
+    "ggml-backend.c",
+    "ggml-quants.c",
+]
+
+var resources: [Resource] = []
+var linkerSettings: [LinkerSetting] = []
+var cSettings: [CSetting] =  [
+    .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
+    .unsafeFlags(["-fno-objc-arc"]),
+    // NOTE: NEW_LAPACK will required iOS version 16.4+
+    // We should consider add this in the future when we drop support for iOS 14
+    // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
+    // .define("ACCELERATE_NEW_LAPACK"),
+    // .define("ACCELERATE_LAPACK_ILP64")
+]
+
+#if canImport(Darwin)
+sources.append("ggml-metal.m")
+resources.append(.process("ggml-metal.metal"))
+linkerSettings.append(.linkedFramework("Accelerate"))
+cSettings.append(
+    contentsOf: [
+        .define("GGML_USE_ACCELERATE"),
+        .define("GGML_USE_METAL")
+    ]
+)
+#endif
+
+#if os(Linux)
+    cSettings.append(.define("_GNU_SOURCE"))
+#endif
+
 let package = Package(
    name: "llama",
    platforms: [
@@ -28,34 +67,11 @@ let package = Package(
               "ggml-cuda.h",
               "Makefile"
            ],
-            sources: [
-                "ggml.c",
-                "llama.cpp",
-                "unicode.cpp",
-                "unicode-data.cpp",
-                "ggml-alloc.c",
-                "ggml-backend.c",
-                "ggml-quants.c",
-                "ggml-metal.m",
-            ],
-            resources: [
-                .process("ggml-metal.metal")
-            ],
+            sources: sources,
+            resources: resources,
            publicHeadersPath: "spm-headers",
-            cSettings: [
-                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE"),
-                .unsafeFlags(["-fno-objc-arc"]),
-                .define("GGML_USE_METAL"),
-                // NOTE: NEW_LAPACK will required iOS version 16.4+
-                // We should consider add this in the future when we drop support for iOS 14
-                // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
-                // .define("ACCELERATE_NEW_LAPACK"),
-                // .define("ACCELERATE_LAPACK_ILP64")
-            ],
-            linkerSettings: [
-                .linkedFramework("Accelerate")
-            ]
+            cSettings: cSettings,
+            linkerSettings: linkerSettings
        )
    ],
    cxxLanguageStandard: .cxx11
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -8,9 +8,9 @@
 - [Linux](#linux)
 - [Windows](#windows)
 - [Environment Variable](#environment-variable)
- [Known Issue](#known-issue)
- [Q&A](#q&a)
- [Todo](#todo)
+- [Known Issue](#known-issues)
+- [Q&A](#qa)
+- [TODO](#todo)

 ## Background

@@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,

 ## OS

-|OS|Status|Verified|
-|-|-|-|
-|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
-|Windows|Support|Windows 11|
+| OS      | Status  | Verified                           |
+|---------|---------|------------------------------------|
+| Linux   | Support | Ubuntu 22.04, Fedora Silverblue 39 |
+| Windows | Support | Windows 11                         |


 ## Hardware
@@ -66,13 +66,13 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,

 **Verified devices**

-|Intel GPU| Status | Verified Model|
-|-|-|-|
-|Intel Data Center Max Series| Support| Max 1550|
-|Intel Data Center Flex Series| Support| Flex 170|
-|Intel Arc Series| Support| Arc 770, 730M|
-|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
-|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
+| Intel GPU                     | Status  | Verified Model                        |
+|-------------------------------|---------|---------------------------------------|
+| Intel Data Center Max Series  | Support | Max 1550, 1100                        |
+| Intel Data Center Flex Series | Support | Flex 170                              |
+| Intel Arc Series              | Support | Arc 770, 730M                         |
+| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake       |
+| Intel iGPU                    | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |

 *Notes:*

@@ -84,24 +84,18 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
 - **Execution Unit (EU)**
  - If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.

-### Nvidia GPU
-The BLAS acceleration on Nvidia GPU through oneAPI can be obtained using the Nvidia plugins for oneAPI and the cuBLAS backend of the upstream oneMKL library. Details and instructions on how to setup the runtime and library can be found in [this section](#i-setup-environment)
+### Other Vendor GPU

 **Verified devices**

-|Nvidia GPU| Status | Verified Model|
-|-|-|-|
-|Ampere Series| Support| A100, A4000|
-|Ampere Series *(Mobile)*| Support| RTX 40 Series|
-
-*Notes:*
-  - Support for Nvidia targets through oneAPI is currently limited to Linux platforms.
-
-  - Please make sure the native oneAPI MKL *(dedicated to intel CPUs and GPUs)* is not "visible" at this stage to properly setup and use the built-from-source oneMKL with cuBLAS backend in llama.cpp for Nvidia GPUs.
-
+| Nvidia GPU               | Status  | Verified Model |
+|--------------------------|---------|----------------|
+| Ampere Series            | Support | A100, A4000    |
+| Ampere Series *(Mobile)* | Support | RTX 40 Series  |

 ## Docker
 The docker build option is currently limited to *intel GPU* targets.
+
 ### Build image
 ```sh
 # Using FP16
@@ -167,30 +161,11 @@ Platform #0: Intel(R) OpenCL HD Graphics

 - **Nvidia GPU**

-In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cublas)-* are installed.
-Installation can be verified by running the following:
-```sh
-nvidia-smi
-```
-Please make sure at least one CUDA device is available, which can be displayed like this *(here an A100-40GB Nvidia GPU)*:
-```
-+---------------------------------------------------------------------------------------+
-| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
-|-----------------------------------------+----------------------+----------------------+
-| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
-| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
-|                                         |                      |               MIG M. |
-|=========================================+======================+======================|
-|   0  NVIDIA A100-PCIE-40GB          On  | 00000000:8D:00.0 Off |                    0 |
-| N/A   36C    P0              57W / 250W |      4MiB / 40960MiB |      0%      Default |
-|                                         |                      |             Disabled |
-+-----------------------------------------+----------------------+----------------------+
-```
-
+In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.

 2. **Install Intel® oneAPI Base toolkit**

- **Base installation**
+- **For Intel GPU**

 The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.

@@ -202,10 +177,10 @@ Upon a successful installation, SYCL is enabled for the available intel devices,

 - **Adding support to Nvidia GPUs**

-**oneAPI**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
+**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.


-**oneMKL**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
+**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.

 ```sh
 git clone https://github.com/oneapi-src/oneMKL
@@ -237,7 +212,7 @@ When targeting an intel GPU, the user should expect one or more level-zero devic

 - **Nvidia GPU**

-Similarly, user targetting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
+Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
 ```
 [opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.12.0.12_195853.xmain-hotfix]
 [opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
@@ -254,12 +229,14 @@ source /opt/intel/oneapi/setvars.sh
 # Build LLAMA with MKL BLAS acceleration for intel GPU
 mkdir -p build && cd build

-# Option 1: Use FP16 for better performance in long-prompt  inference
-cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-# Or without "--build", run "make" next
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

-# Option 2: Use FP32 by default
-cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+# Option 2: Use FP16
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
+#build all binary
+cmake --build . --config Release -j -v
 ```

 #### Nvidia GPU
@@ -273,11 +250,15 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
 # Build LLAMA with Nvidia BLAS acceleration through SYCL
 mkdir -p build && cd build

-# Option 1: Use FP16 for better performance in long-prompt  inference
-cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+# Option 2: Use FP16
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+
+#build all binary
+cmake --build . --config Release -j -v

-# Option 2: Use FP32 by default
-cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 ```

 ### III. Run the inference
@@ -313,10 +294,10 @@ found 6 SYCL devices:
 | 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|
 ```

-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero driver/runtime, recommended |
-|compute capability 3.0|OpenCL driver/runtime, slower than level-zero in most cases|
+| Attribute              | Note                                                        |
+|------------------------|-------------------------------------------------------------|
+| compute capability 1.3 | Level-zero driver/runtime, recommended                      |
+| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |

 4. Launch inference

@@ -325,10 +306,10 @@ There are two device selection modes:
 - Single device: Use one device target specified by the user.
 - Multiple devices: Automatically select the devices with the same largest Max compute-units.

-|Device selection|Parameter|
-|-|-|
-|Single device|--split-mode none --main-gpu DEVICE_ID |
-|Multiple devices|--split-mode layer (default)|
+| Device selection | Parameter                              |
+|------------------|----------------------------------------|
+| Single device    | --split-mode none --main-gpu DEVICE_ID |
+| Multiple devices | --split-mode layer (default)           |

 Examples:

@@ -357,7 +338,6 @@ Otherwise, you can run the script:

 *Notes:*

- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `/bin/main` if faced with the issue.
 - Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:

 ```sh
@@ -436,9 +416,13 @@ mkdir -p build
 cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
+
+# Option 2: Or FP16
 cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

-make
+make -j
 ```

 Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
@@ -486,10 +470,10 @@ found 6 SYCL devices:

 ```

-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
+| Attribute              | Note                                                      |
+|------------------------|-----------------------------------------------------------|
+| compute capability 1.3 | Level-zero running time, recommended                      |
+| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |


 4. Launch inference
@@ -499,10 +483,10 @@ There are two device selection modes:
 - Single device: Use one device assigned by user.
 - Multiple devices: Automatically choose the devices with the same biggest Max compute units.

-|Device selection|Parameter|
-|-|-|
-|Single device|--split-mode none --main-gpu DEVICE_ID |
-|Multiple devices|--split-mode layer (default)|
+| Device selection | Parameter                              |
+|------------------|----------------------------------------|
+| Single device    | --split-mode none --main-gpu DEVICE_ID |
+| Multiple devices | --split-mode layer (default)           |

 Examples:

@@ -525,7 +509,6 @@ Otherwise, run the following wrapper script:

 Note:

- By default, `mmap` is used to read the model file. In some cases, it causes runtime hang issues. Please disable it by passing `--no-mmap` to the `main.exe` if faced with the issue.
 - Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:

 ```sh
@@ -540,29 +523,23 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 #### Build

-|Name|Value|Function|
-|-|-|-|
-|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.|
-|LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA|Set the SYCL target device type.|
-|LLAMA_SYCL_F16|OFF *(default)* \|ON *(optional)*|Enable FP16 build with SYCL code path.|
-|CMAKE_C_COMPILER|icx|Set *icx* compiler for SYCL code path.|
-|CMAKE_CXX_COMPILER|icpx *(Linux)*, icx *(Windows)*|Set `icpx/icx` compiler for SYCL code path.|
+| Name               | Value                             | Function                                    |
+|--------------------|-----------------------------------|---------------------------------------------|
+| LLAMA_SYCL         | ON (mandatory)                    | Enable build with SYCL code path.           |
+| LLAMA_SYCL_TARGET  | INTEL *(default)* \| NVIDIA       | Set the SYCL target device type.            |
+| LLAMA_SYCL_F16     | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path.      |
+| CMAKE_C_COMPILER   | icx                               | Set *icx* compiler for SYCL code path.      |
+| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)*   | Set `icpx/icx` compiler for SYCL code path. |

 #### Runtime

-|Name|Value|Function|
-|-|-|-|
-|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
-|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|
+| Name              | Value            | Function                                                                                                                  |
+|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
+| GGML_SYCL_DEBUG   | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG                                                                             |
+| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |

 ## Known Issues

- Hanging during startup
-
-  llama.cpp uses *mmap* as the default mode for reading the model file and copying it to the GPU. In some systems, `memcpy` might behave abnormally and therefore hang.
-
-  - **Solution**: add `--no-mmap` or `--mmap 0` flag to the `main` executable.
-
 - `Split-mode:[row]` is not supported.

 ## Q&A
@@ -574,7 +551,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512

 - General compiler error:

-  - Remove build folder or try a clean-build.
+  - Remove **build** folder or try a clean-build.

 - I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.

@@ -591,6 +568,6 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 ### **GitHub contribution**:
 Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.

-## Todo
+## TODO

 - Support row layer split for multiple card runs.
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Recent API changes

+- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
 - [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
 - [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
 - [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
@@ -92,9 +93,11 @@ Typically finetunes of the base models below are supported as well.

 - [X] LLaMA 🦙
 - [x] LLaMA 2 🦙🦙
+- [x] LLaMA 3 🦙🦙🦙
 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [X] Falcon
+- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
+- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
@@ -117,10 +120,12 @@ Typically finetunes of the base models below are supported as well.
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
 - [x] [Mamba](https://github.com/state-spaces/mamba)
+- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf)
 - [x] [Xverse](https://huggingface.co/models?search=xverse)
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
+- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r)
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
+- [x] [OLMo](https://allenai.org/olmo)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))

@@ -132,6 +137,8 @@ Typically finetunes of the base models below are supported as well.
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
+- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
+- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)

 **HTTP server**

@@ -188,6 +195,8 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [MindMac](https://mindmac.app) (proprietary)
 - [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
+- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

 ---
@@ -485,14 +494,14 @@ Building the program with BLAS support may lead to some performance improvements

  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:

-  | Option                         | Legal values           | Default | Description |
-  |--------------------------------|------------------------|---------|-------------|
-  | LLAMA_CUDA_FORCE_DMMV          | Boolean                |   false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
-  | LLAMA_CUDA_DMMV_X              | Positive integer >= 32 |      32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_MMV_Y               | Positive integer       |       1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
-  | LLAMA_CUDA_F16                 | Boolean                |   false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
-  | LLAMA_CUDA_KQUANTS_ITER        | 1 or 2                 |       2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
-  | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       |     128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
+  | Option                         | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
+  |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+  | LLAMA_CUDA_FORCE_DMMV          | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
+  | LLAMA_CUDA_DMMV_X              | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants.                                         |
+  | LLAMA_CUDA_MMV_Y               | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended.                                                                                                                                         |
+  | LLAMA_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
+  | LLAMA_CUDA_KQUANTS_ITER        | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                                                                     |
+  | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |

 - #### hipBLAS

@@ -534,18 +543,18 @@ Building the program with BLAS support may lead to some performance improvements
  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
  The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):

-  | Option                  | Legal values           | Default | Description |
-  |-------------------------|------------------------|---------|-------------|
-  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 |      32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_MMV_Y        | Positive integer       |       1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 |       2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+  | Option                  | Legal values           | Default | Description                                                                                                                                                                                                                                    |
+  |-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+  | LLAMA_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants.                                                                       |
+  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs.                                                                             |

 - #### CLBlast

  OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.

  You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
-    - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+    - For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.

    - For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.

@@ -570,6 +579,12 @@ Building the program with BLAS support may lead to some performance improvements

  Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.

+  Linux packaging:
+  Fedora Linux:
+  ```bash
+  sudo dnf install clblast
+  ```
+
  Alternatively, they may be built from source.

  - <details>
@@ -746,11 +761,11 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver
 As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.

 | Model | Original size | Quantized size (Q4_0) |
-|------:|--------------:|-----------------------:|
-|    7B |         13 GB |                 3.9 GB |
-|   13B |         24 GB |                 7.8 GB |
-|   30B |         60 GB |                19.5 GB |
-|   65B |        120 GB |                38.5 GB |
+|------:|--------------:|----------------------:|
+|    7B |         13 GB |                3.9 GB |
+|   13B |         24 GB |                7.8 GB |
+|   30B |         60 GB |               19.5 GB |
+|   65B |        120 GB |               38.5 GB |

 ### Quantization

@@ -758,7 +773,7 @@ Several quantization methods are supported. They differ in the resulting model d

 *(outdated)*

-| Model | Measure      | F16    | Q4_0   | Q4_1   | Q5_0   | Q5_1   | Q8_0   |
+| Model | Measure      |    F16 |   Q4_0 |   Q4_1 |   Q5_0 |   Q5_1 |   Q8_0 |
 |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
 |    7B | perplexity   | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
 |    7B | file size    |  13.0G |   3.5G |   3.9G |   4.3G |   4.7G |   6.7G |
@@ -1106,7 +1121,9 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m
 - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
+
+![matmul](media/matmul.png)

 ### Docs

--- a/SECURITY.md
+++ b/SECURITY.md
@@ -49,11 +49,11 @@ If you intend to run multiple models in parallel with shared memory, it is your

 1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity.

-1. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.
+2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.

-1. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.
+3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.

-1. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.
+4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time.

 ## Reporting a vulnerability

--- a/build.zig
+++ b/build.zig
@@ -112,6 +112,7 @@ pub fn build(b: *std.build.Builder) !void {
    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;

    const ggml = make.obj("ggml", "ggml.c");
+    const sgemm = make.obj("sgemm", "sgemm.cpp");
    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
@@ -128,15 +129,44 @@ pub fn build(b: *std.build.Builder) !void {
    const clip = make.obj("clip", "examples/llava/clip.cpp");
    const llava = make.obj("llava", "examples/llava/llava.cpp");

-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });

-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
+
+    const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
+    for (server_assets) |asset| {
+        const input_path = b.fmt("examples/server/public/{s}", .{asset});
+        const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
+
+        // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
+
+        const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
+        defer b.allocator.free(input);
+
+        var buf = std.ArrayList(u8).init(b.allocator);
+        defer buf.deinit();
+
+        for (input) |byte| {
+            try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
+        }
+
+        var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
+        defer b.allocator.free(name);
+        std.mem.replaceScalar(u8, name, '.', '_');
+
+        try std.fs.cwd().writeFile(output_path, b.fmt(
+            "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
+            .{ name, buf.items, name, input.len },
+        ));
+
+        std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
+    }
 }
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -153,6 +153,55 @@ function gg_sum_ctest_release {
    gg_printf '```\n'
 }

+# test_scripts_debug
+
+function gg_run_test_scripts_debug {
+    cd ${SRC}
+
+    set -e
+
+    # TODO: too slow, run on dedicated node
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+   #(cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+
+    set +e
+}
+
+function gg_sum_test_scripts_debug {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs test scripts in debug mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
+# test_scripts_release
+
+function gg_run_test_scripts_release {
+    cd ${SRC}
+
+    set -e
+
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+
+    set +e
+}
+
+function gg_sum_test_scripts_release {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs test scripts in release mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
 function gg_get_model {
    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
@@ -642,6 +691,9 @@ test $ret -eq 0 && gg_run ctest_release
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small

+    test $ret -eq 0 && gg_run test_scripts_debug
+    test $ret -eq 0 && gg_run test_scripts_release
+
    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
        if [ -z ${GG_BUILD_CUDA} ]; then
            test $ret -eq 0 && gg_run open_llama_3b_v2
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -47,9 +47,6 @@ if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()

-set(TARGET json-schema-to-grammar)
-add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h)
-
 set(TARGET common)

 add_library(${TARGET} STATIC
@@ -63,6 +60,7 @@ add_library(${TARGET} STATIC
    grammar-parser.h
    grammar-parser.cpp
    json.hpp
+    json-schema-to-grammar.cpp
    train.h
    train.cpp
    ngram-cache.h
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,4 +1,6 @@
 #include "common.h"
+#include "json.hpp"
+#include "json-schema-to-grammar.h"
 #include "llama.h"

 #include <algorithm>
@@ -68,6 +70,8 @@
 #define LLAMA_CURL_MAX_HEADER_LENGTH 256
 #endif // LLAMA_USE_CURL

+using json = nlohmann::ordered_json;
+
 int32_t get_num_physical_cores() {
 #ifdef __linux__
    // enumerate the set of thread siblings, num entries is num cores
@@ -104,6 +108,79 @@ int32_t get_num_physical_cores() {
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }

+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#include <pthread.h>
+
+static void cpuid(unsigned leaf, unsigned subleaf,
+                  unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) {
+    __asm__("movq\t%%rbx,%%rsi\n\t"
+            "cpuid\n\t"
+            "xchgq\t%%rbx,%%rsi"
+            : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx)
+            : "0"(leaf), "2"(subleaf));
+}
+
+static int pin_cpu(int cpu) {
+    cpu_set_t mask;
+    CPU_ZERO(&mask);
+    CPU_SET(cpu, &mask);
+    return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
+}
+
+static bool is_hybrid_cpu(void) {
+    unsigned eax, ebx, ecx, edx;
+    cpuid(7, 0, &eax, &ebx, &ecx, &edx);
+    return !!(edx & (1u << 15));
+}
+
+static bool is_running_on_efficiency_core(void) {
+    unsigned eax, ebx, ecx, edx;
+    cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx);
+    int intel_atom = 0x20;
+    int core_type = (eax & 0xff000000u) >> 24;
+    return core_type == intel_atom;
+}
+
+static int count_math_cpus(int cpu_count) {
+    int result = 0;
+    for (int cpu = 0; cpu < cpu_count; ++cpu) {
+        if (pin_cpu(cpu)) {
+            return -1;
+        }
+        if (is_running_on_efficiency_core()) {
+            continue; // efficiency cores harm lockstep threading
+        }
+        ++cpu; // hyperthreading isn't useful for linear algebra
+        ++result;
+    }
+    return result;
+}
+
+#endif // __x86_64__ && __linux__
+
+/**
+ * Returns number of CPUs on system that are useful for math.
+ */
+int get_math_cpu_count() {
+#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+    int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
+    if (cpu_count < 1) {
+        return get_num_physical_cores();
+    }
+    if (is_hybrid_cpu()) {
+        cpu_set_t affinity;
+        if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
+            int result = count_math_cpus(cpu_count);
+            pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
+            if (result > 0) {
+                return result;
+            }
+        }
+    }
+#endif
+    return get_num_physical_cores();
+}
+
 void process_escapes(std::string & input) {
    std::size_t input_len = input.length();
    std::size_t output_idx = 0;
@@ -157,15 +234,63 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    return result;
 }

+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+    const char * sep = strchr(data, '=');
+    if (sep == nullptr || sep - data >= 128) {
+        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+        return false;
+    }
+    llama_model_kv_override kvo;
+    std::strncpy(kvo.key, data, sep - data);
+    kvo.key[sep - data] = 0;
+    sep++;
+    if (strncmp(sep, "int:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+        kvo.val_i64 = std::atol(sep);
+    } else if (strncmp(sep, "float:", 6) == 0) {
+        sep += 6;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.val_f64 = std::atof(sep);
+    } else if (strncmp(sep, "bool:", 5) == 0) {
+        sep += 5;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+        if (std::strcmp(sep, "true") == 0) {
+            kvo.val_bool = true;
+        } else if (std::strcmp(sep, "false") == 0) {
+            kvo.val_bool = false;
+        } else {
+            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            return false;
+        }
+    } else if (strncmp(sep, "str:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+        if (strlen(sep) > 127) {
+            fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+            return false;
+        }
+        strncpy(kvo.val_str, sep, 127);
+        kvo.val_str[127] = '\0';
+    } else {
+        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+        return false;
+    }
+    overrides.emplace_back(std::move(kvo));
+    return true;
+}
+
 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
-    llama_sampling_params& sparams = params.sparams;
+    llama_sampling_params & sparams = params.sparams;

    if (arg == "-s" || arg == "--seed") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
+        // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context.
        params.seed = std::stoul(argv[i]);
+        sparams.seed = std::stoul(argv[i]);
        return true;
    }
    if (arg == "-t" || arg == "--threads") {
@@ -1010,6 +1135,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.n_print = std::stoi(argv[i]);
        return true;
    }
+    if (arg == "--check-tensors") {
+        params.check_tensors = true;
+        return true;
+    }
    if (arg == "--ppl-output-type") {
        if (++i >= argc) {
            invalid_param = true;
@@ -1148,52 +1277,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        );
        return true;
    }
+    if (arg == "-j" || arg == "--json-schema") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        sparams.grammar = json_schema_to_grammar(json::parse(argv[i]));
+        return true;
+    }
    if (arg == "--override-kv") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
-        char* sep = strchr(argv[i], '=');
-        if (sep == nullptr || sep - argv[i] >= 128) {
-            fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        struct llama_model_kv_override kvo;
-        std::strncpy(kvo.key, argv[i], sep - argv[i]);
-        kvo.key[sep - argv[i]] = 0;
-        sep++;
-        if (strncmp(sep, "int:", 4) == 0) {
-            sep += 4;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.int_value = std::atol(sep);
-        }
-        else if (strncmp(sep, "float:", 6) == 0) {
-            sep += 6;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-            kvo.float_value = std::atof(sep);
-        }
-        else if (strncmp(sep, "bool:", 5) == 0) {
-            sep += 5;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-            if (std::strcmp(sep, "true") == 0) {
-                kvo.bool_value = true;
-            }
-            else if (std::strcmp(sep, "false") == 0) {
-                kvo.bool_value = false;
-            }
-            else {
-                fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                invalid_param = true;
-                return true;
-            }
-        }
-        else {
+        if (!parse_kv_override(argv[i], params.kv_overrides)) {
            fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
            invalid_param = true;
            return true;
        }
-        params.kv_overrides.push_back(kvo);
        return true;
    }
 #ifndef LOG_DISABLE_LOGS
@@ -1353,6 +1454,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
    printf("  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
    printf("  --grammar-file FNAME  file to read grammar from\n");
+    printf("  -j SCHEMA, --json-schema SCHEMA\n");
+    printf("                        JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
+    printf("                        For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
    printf("  --cfg-negative-prompt PROMPT\n");
    printf("                        negative prompt to use for guidance. (default: empty)\n");
    printf("  --cfg-negative-prompt-file FNAME\n");
@@ -1461,9 +1565,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+    printf("                        types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf("  -ptc N, --print-token-count N\n");
    printf("                        print token count every N tokens (default: %d)\n", params.n_print);
+    printf("  --check-tensors       check model tensor data for invalid values\n");
    printf("\n");
 #ifndef LOG_DISABLE_LOGS
    log_print_usage();
@@ -1684,6 +1789,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
+    mparams.check_tensors   = params.check_tensors;
    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
    } else {
@@ -1745,6 +1851,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
    cparams.pooling_type      = params.pooling_type;
    cparams.defrag_thold      = params.defrag_thold;
+    cparams.cb_eval           = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
    cparams.offload_kqv       = !params.no_kv_offload;

    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
@@ -2192,7 +2300,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
        params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
    }

-    {
+    if (params.warmup) {
        LOG("warming up the model with an empty run\n");

        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
@@ -2236,12 +2344,12 @@ std::vector<llama_token> llama_tokenize(
    return result;
 }

-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
--- a/common/common.h
+++ b/common/common.h
@@ -39,6 +39,7 @@ extern char const *LLAMA_BUILD_TARGET;

 struct llama_control_vector_load_info;

+int get_math_cpu_count();
 int32_t get_num_physical_cores();

 //
@@ -48,7 +49,7 @@ int32_t get_num_physical_cores();
 struct gpt_params {
    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed

-    int32_t n_threads             = get_num_physical_cores();
+    int32_t n_threads             = get_math_cpu_count();
    int32_t n_threads_draft       = -1;
    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
    int32_t n_threads_batch_draft = -1;
@@ -80,10 +81,13 @@ struct gpt_params {
    int32_t yarn_orig_ctx         = 0;     // YaRN original context length
    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold

+    ggml_backend_sched_eval_callback cb_eval = nullptr;
+    void * cb_eval_user_data                 = nullptr;
+
    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

-    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
-    llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings

    // // sampling parameters
    struct llama_sampling_params sparams;
@@ -156,6 +160,8 @@ struct gpt_params {
    bool infill            = false; // use infill mode
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading
+    bool warmup            = true;  // warmup run
+    bool check_tensors     = false; // validate tensor data

    std::string cache_type_k = "f16"; // KV cache data type for the K
    std::string cache_type_v = "f16"; // KV cache data type for the V
@@ -165,6 +171,8 @@ struct gpt_params {
    std::string image  = ""; // path to an image file
 };

+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
@@ -232,11 +240,12 @@ std::vector<llama_token> llama_tokenize(
                        bool   add_special,
                        bool   parse_special = false);

-// tokenizes a token into a piece
+// tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
 std::string llama_token_to_piece(
        const struct llama_context * ctx,
-                       llama_token   token);
+                       llama_token   token,
+                       bool          special = true);

 // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
 //       that takes into account the tokenizer type and decides how to handle the leading space
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -11,35 +11,101 @@

 using json = nlohmann::ordered_json;

+template <typename Iterator>
+static std::string join(Iterator begin, Iterator end, const std::string & separator);
+
+static std::string repeat(const std::string & str, size_t n);
+
+static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
+    if (separator_rule.empty()) {
+        if (min_items == 0 && max_items == 1) {
+            return item_rule + "?";
+        } else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
+            return item_rule + "+";
+        }
+    }
+
+    std::string result;
+    if (min_items > 0) {
+        if (item_rule_is_literal && separator_rule.empty()) {
+            result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
+        } else {
+            std::vector<std::string> items(min_items, item_rule);
+            result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
+        }
+    }
+
+    std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
+        auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
+
+        if (up_to_n == 0) {
+            return "";
+        } else if (up_to_n == 1) {
+            return "(" + content + ")?";
+        } else if (!separator_rule.empty() && !prefix_with_sep) {
+            return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
+        } else {
+            std::string res = repeat("(" + content + " ", up_to_n);
+            // strip trailing space
+            res = res.substr(0, res.length() - 1);
+            res += repeat(")?", up_to_n);
+            return res;
+        }
+    };
+
+    if (min_items > 0 && max_items != min_items) {
+        result += " ";
+    }
+
+    if (max_items != std::numeric_limits<int>::max()) {
+        result += opt_repetitions(max_items - min_items, min_items > 0);
+    } else {
+        std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
+        if (min_items == 0 && !separator_rule.empty()) {
+            result = "(" + item_rule + " " + item_operator + "*)?";
+        } else {
+            result += item_operator + "*";
+        }
+    }
+
+    return result;
+}
+
 const std::string SPACE_RULE = "\" \"?";

-std::unordered_map<std::string, std::string> PRIMITIVE_RULES = {
-    {"boolean", "(\"true\" | \"false\") space"},
-    {"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"},
-    {"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"},
-    {"value", "object | array | string | number | boolean"},
-    {"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"},
-    {"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"},
-    {"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
-                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"},
-    {"string", " \"\\\"\" (\n"
-               "        [^\"\\\\] |\n"
-               "        \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n"
-               "      )* \"\\\"\" space"},
-    {"null", "\"null\" space"}
+struct BuiltinRule {
+    std::string content;
+    std::vector<std::string> deps;
 };
-std::vector<std::string> OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"};

-std::unordered_map<std::string, std::string> DATE_RULES = {
-    {"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"},
-    {"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"},
-    {"date-time", "date \"T\" time"},
-    {"date-string", "\"\\\"\" date \"\\\"\" space"},
-    {"time-string", "\"\\\"\" time \"\\\"\" space"},
-    {"date-time-string", "\"\\\"\" date-time \"\\\"\" space"}
+const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
+
+std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
+    {"boolean", {"(\"true\" | \"false\") space", {}}},
+    {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
+    {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
+    {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
+    {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
+    {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
+    {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
+    {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
+    {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
+    {"char",   {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
+    {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
+    {"null", {"\"null\" space", {}}},
+};
+
+std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
+    {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
+    {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
+    {"date-time", {"date \"T\" time", {"date", "time"}}},
+    {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
+    {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
+    {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}}
 };

 static bool is_reserved_name(const std::string & name) {
@@ -47,7 +113,7 @@ static bool is_reserved_name(const std::string & name) {
    if (RESERVED_NAMES.empty()) {
        RESERVED_NAMES.insert("root");
        for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
-        for (const auto &p : DATE_RULES) RESERVED_NAMES.insert(p.first);
+        for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
    }
    return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
 }
@@ -192,7 +258,7 @@ private:
                if (_dotall) {
                    rule = "[\\U00000000-\\U0010FFFF]";
                } else {
-                    rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]";
+                    rule = "[^\\x0A\\x0D]";
                }
                return _add_rule("dot", rule);
            };
@@ -308,47 +374,21 @@ private:
                    auto &sub = last.first;
                    auto sub_is_literal = last.second;

-                    if (min_times == 0 && max_times == std::numeric_limits<int>::max()) {
-                        sub += "*";
-                    } else if (min_times == 0 && max_times == 1) {
-                        sub += "?";
-                    } else if (min_times == 1 && max_times == std::numeric_limits<int>::max()) {
-                        sub += "+";
-                    } else {
-                        if (!sub_is_literal) {
-                            std::string & sub_id = sub_rule_ids[sub];
-                            if (sub_id.empty()) {
-                                sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
-                            }
-                            sub = sub_id;
+                    if (!sub_is_literal) {
+                        std::string & sub_id = sub_rule_ids[sub];
+                        if (sub_id.empty()) {
+                            sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
                        }
-                        std::string result;
-                        if (sub_is_literal && min_times > 0) {
-                            result = "\"" + repeat(sub.substr(1, sub.length() - 2), min_times) + "\"";
-                        } else {
-                            for (int j = 0; j < min_times; j++) {
-                                if (j > 0) {
-                                    result += " ";
-                                }
-                                result += sub;
-                            }
-                        }
-                        if (min_times > 0 && min_times < max_times) {
-                            result += " ";
-                        }
-                        if (max_times == std::numeric_limits<int>::max()) {
-                            result += sub + "*";
-                        } else {
-                            for (int j = min_times; j < max_times; j++) {
-                                if (j > min_times) {
-                                    result += " ";
-                                }
-                                result += sub + "?";
-                            }
-                        }
-                        seq.back().first = result;
-                        seq.back().second = false;
+                        sub = sub_id;
                    }
+                    seq.back().first = build_repetition(
+                        sub_is_literal ? "\"" + sub + "\"" : sub,
+                        min_times,
+                        max_times,
+                        "",
+                        sub_is_literal
+                    );
+                    seq.back().second = false;
                } else {
                    std::string literal;
                    auto is_non_literal = [&](char c) {
@@ -424,7 +464,7 @@ private:
        if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
            std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
            std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
-            std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
+            std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
            prop_kv_rule_names["*"] = kv_rule;
            optional_props.push_back("*");
        }
@@ -486,6 +526,25 @@ private:
        return rule;
    }

+    std::string _add_primitive(const std::string & name, const BuiltinRule & rule) {
+        auto n = _add_rule(name, rule.content);
+        for (const auto & dep : rule.deps) {
+            BuiltinRule dep_rule;
+            auto it = PRIMITIVE_RULES.find(dep);
+            if (it == PRIMITIVE_RULES.end()) {
+                it = STRING_FORMAT_RULES.find(dep);
+                if (it == STRING_FORMAT_RULES.end()) {
+                    _errors.push_back("Rule " + dep + " not known");
+                    continue;
+                }
+            }
+            if (_rules.find(dep) == _rules.end()) {
+                _add_primitive(dep, it->second);
+            }
+        }
+        return n;
+    }
+
 public:
    SchemaConverter(
        const std::function<json(const std::string &)> & fetch_json,
@@ -647,49 +706,33 @@ public:
                return _add_rule(rule_name, rule);
            } else {
                std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
-                std::string list_item_operator = "( \",\" space " + item_rule_name + " )";
-                std::string successive_items;
                int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
                json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
-                int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : -1;
-                if (min_items > 0) {
-                    successive_items += repeat(list_item_operator, min_items - 1);
-                    min_items--;
-                }
-                if (max_items >= 0 && max_items > min_items) {
-                    successive_items += repeat(list_item_operator + "?", max_items - min_items - 1);
-                } else {
-                    successive_items += list_item_operator + "*";
-                }
-                std::string rule;
-                if (min_items == 0) {
-                    rule =  "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space";
-                } else {
-                    rule =  "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space";
-                }
-                return _add_rule(rule_name, rule);
+                int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : std::numeric_limits<int>::max();
+
+                return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space");
            }
        } else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
            return _visit_pattern(schema["pattern"], rule_name);
        } else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
-            return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
-        } else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) {
-            for (const auto & kv : DATE_RULES) {
-                _add_rule(kv.first, kv.second);
-            }
-            return schema_format + "-string";
+            return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
+        } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) {
+            auto prim_name = schema_format + "-string";
+            return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name)));
+        } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) {
+            std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
+            int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
+            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
+            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
        } else if (schema.empty() || schema_type == "object") {
-            for (const auto & n : OBJECT_RULE_NAMES) {
-                _add_rule(n, PRIMITIVE_RULES.at(n));
-            }
-            return _add_rule(rule_name, "object");
+            return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
        } else {
            if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
                _errors.push_back("Unrecognized schema: " + schema.dump());
                return "";
            }
            // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
-            return _add_rule(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
+            return _add_primitive(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
        }
    }

--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,4 +1,6 @@
+#define LLAMA_API_INTERNAL
 #include "sampling.h"
+#include <random>

 struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
    struct llama_sampling_context * result = new llama_sampling_context();
@@ -33,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

    result->prev.resize(params.n_prev);

+    llama_sampling_set_rng_seed(result, params.seed);
+
    return result;
 }

@@ -62,6 +66,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
    ctx->cur.clear();
 }

+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
+    if (seed == LLAMA_DEFAULT_SEED) {
+        seed = time(NULL);
+    }
+    ctx->rng.seed(seed);
+}
+
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
    if (dst->grammar) {
        llama_grammar_free(dst->grammar);
@@ -203,7 +214,7 @@ static llama_token llama_sampling_sample_impl(

            sampler_queue(ctx_main, params, cur_p, min_keep);

-            id = llama_sample_token(ctx_main, &cur_p);
+            id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng);

            //{
            //    const int n_top = 10;
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -4,9 +4,10 @@

 #include "grammar-parser.h"

+#include <random>
 #include <string>
-#include <vector>
 #include <unordered_map>
+#include <vector>

 // sampler types
 enum class llama_sampler_type : char {
@@ -20,25 +21,26 @@ enum class llama_sampler_type : char {

 // sampling parameters
 typedef struct llama_sampling_params {
-    int32_t     n_prev                = 64;       // number of previous tokens to remember
-    int32_t     n_probs               = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t     min_keep              = 0;        // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t     top_k                 = 40;       // <= 0 to use vocab size
-    float       top_p                 = 0.95f;    // 1.0 = disabled
-    float       min_p                 = 0.05f;    // 0.0 = disabled
-    float       tfs_z                 = 1.00f;    // 1.0 = disabled
-    float       typical_p             = 1.00f;    // 1.0 = disabled
-    float       temp                  = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float       dynatemp_range        = 0.00f;    // 0.0 = disabled
-    float       dynatemp_exponent     = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float       penalty_repeat        = 1.00f;    // 1.0 = disabled
-    float       penalty_freq          = 0.00f;    // 0.0 = disabled
-    float       penalty_present       = 0.00f;    // 0.0 = disabled
-    int32_t     mirostat              = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float       mirostat_tau          = 5.00f;    // target entropy
-    float       mirostat_eta          = 0.10f;    // learning rate
-    bool        penalize_nl           = false;     // consider newlines as a repeatable token
+    int32_t     n_prev                = 64;                 // number of previous tokens to remember
+    int32_t     n_probs               = 0;                  // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t     min_keep              = 0;                  // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t     top_k                 = 40;                 // <= 0 to use vocab size
+    float       top_p                 = 0.95f;              // 1.0 = disabled
+    float       min_p                 = 0.05f;              // 0.0 = disabled
+    float       tfs_z                 = 1.00f;              // 1.0 = disabled
+    float       typical_p             = 1.00f;              // 1.0 = disabled
+    float       temp                  = 0.80f;              // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float       dynatemp_range        = 0.00f;              // 0.0 = disabled
+    float       dynatemp_exponent     = 1.00f;              // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t     penalty_last_n        = 64;                 // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float       penalty_repeat        = 1.00f;              // 1.0 = disabled
+    float       penalty_freq          = 0.00f;              // 0.0 = disabled
+    float       penalty_present       = 0.00f;              // 0.0 = disabled
+    int32_t     mirostat              = 0;                  // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float       mirostat_tau          = 5.00f;              // target entropy
+    float       mirostat_eta          = 0.10f;              // learning rate
+    bool        penalize_nl           = false;              // consider newlines as a repeatable token
+    uint32_t    seed                  = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context

    std::vector<llama_sampler_type> samplers_sequence = {
        llama_sampler_type::TOP_K,
@@ -79,6 +81,8 @@ struct llama_sampling_context {
    // TODO: replace with ring-buffer
    std::vector<llama_token>      prev;
    std::vector<llama_token_data> cur;
+
+    std::mt19937 rng;
 };

 #include "common.h"
@@ -93,6 +97,9 @@ void llama_sampling_free(struct llama_sampling_context * ctx);
 // - reset grammar
 void llama_sampling_reset(llama_sampling_context * ctx);

+// Set the sampler seed
+void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed);
+
 // Copy the sampler context
 void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);

--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -43,17 +43,18 @@ AnyModel = TypeVar("AnyModel", bound="type[Model]")
 class Model(ABC):
    _model_classes: dict[str, type[Model]] = {}

-    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
+    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool):
        self.dir_model = dir_model
        self.ftype = ftype
        self.fname_out = fname_out
        self.is_big_endian = is_big_endian
        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
+        self.use_temp_file = use_temp_file
        self.is_safetensors = self._is_model_safetensors()
        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
+        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])

    @property
@@ -362,6 +363,16 @@ class Model(ABC):
                        scores.append(-1000.0)
                        toktypes.append(SentencePieceTokenTypes.USER_DEFINED)

+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            print(
+                f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]"
+            )
+            for i in range(1, pad_count + 1):
+                tokens.append(f"[PAD{i}]")
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
        assert len(tokens) == vocab_size

        self.gguf_writer.add_tokenizer_model("llama")
@@ -1206,9 +1217,91 @@ class StableLMModel(Model):
        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
        self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))

+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_head = self.hparams.get("num_attention_heads")
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        q_norms = dict()
+        k_norms = dict()
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+            n_dims = len(data.shape)
+            if name.find("q_layernorm.norms") != -1:
+                q_norms[name] = data
+                if len(q_norms) >= (block_count * n_head):
+                    self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
+                continue
+            if name.find("k_layernorm.norms") != -1:
+                k_norms[name] = data
+                if len(k_norms) >= (block_count * n_kv_head):
+                    self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
+                continue
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+    def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
+        for bid in range(block_count):
+            datas = []
+            for xid in range(n_head):
+                ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
+                datas.append(norms[ename])
+                del norms[ename]
+            data = np.stack(datas, axis=0)
+            data_dtype = data.dtype
+            merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
+            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+

@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
 class LlamaModel(Model):
@@ -1218,7 +1311,23 @@ class LlamaModel(Model):
        try:
            self. _set_vocab_sentencepiece()
        except FileNotFoundError:
-            self._set_vocab_llama_hf()
+            try:
+                self._set_vocab_llama_hf()
+            except (FileNotFoundError, TypeError):
+                # Llama 3
+                self._set_vocab_gpt2()
+
+        # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
+        if self.hparams.get("vocab_size", 32000) == 32016:
+            special_vocab = gguf.SpecialVocab(
+                self.dir_model, load_merges=False,
+                special_token_types = ['prefix', 'suffix', 'middle', 'eot']
+            )
+            special_vocab._set_special_token("prefix", 32007)
+            special_vocab._set_special_token("suffix", 32008)
+            special_vocab._set_special_token("middle", 32009)
+            special_vocab._set_special_token("eot",    32010)
+            special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
@@ -1427,6 +1536,102 @@ class GrokModel(Model):
            self.gguf_writer.add_tensor(new_name, data)


+@Model.register("DbrxForCausalLM")
+class DbrxModel(Model):
+    model_arch = gguf.MODEL_ARCH.DBRX
+
+    def set_gguf_parameters(self):
+        ffn_config = self.hparams["ffn_config"]
+        attn_config = self.hparams["attn_config"]
+        self.gguf_writer.add_name(self.hparams["model_type"])
+        self.gguf_writer.add_block_count(self.hparams["n_layers"])
+
+        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
+
+        self.gguf_writer.add_head_count(self.hparams["n_heads"])
+        self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
+
+        self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
+
+        self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+        self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
+        self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
+
+        self.gguf_writer.add_layer_norm_eps(1e-5)
+
+        self.gguf_writer.add_file_type(self.ftype)
+        print(f"gguf: file type = {self.ftype}")
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers")
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        for name, data_torch in self.get_tensors():
+            n_expert = self.hparams["ffn_config"]["moe_num_experts"]
+            n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
+            n_embd = self.hparams["d_model"]
+
+            # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
+            # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
+            # But llama.cpp moe graph works differently
+            # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
+            # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
+            exp_tensor_names = {"ffn.experts.mlp.w1": None,       # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff,   n_expert}
+                                "ffn.experts.mlp.w2": (0, 2, 1),  # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff,   n_embd, n_expert}
+                                "ffn.experts.mlp.v1": None}       # LLM_TENSOR_FFN_UP_EXPS   ggml_tensor->ne{n_embd, n_ff,   n_expert}
+            experts = False
+            for exp_tensor_name in exp_tensor_names.keys():
+                if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
+                    experts = True
+                    data_torch = data_torch.view(n_expert, n_ff, n_embd)
+                    if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
+                        data_torch = data_torch.permute(*permute_tensor)
+                    break
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            # In MoE models the ffn tensors are typically most of the model weights,
+            # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
+            # Every other model has the weight names ending in .weight,
+            # let's assume that is the convention which is not the case for dbrx:
+            # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
+            new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # Most of the codebase that takes in 1D tensors only handles F32 tensors
+            # and most of the outputs tensors are F32.
+            if data_dtype != np.float32 and n_dims == 1:
+                print(f"Can not map tensor {name!r}: all 1D tensors must be F32")
+                sys.exit()
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+
@Model.register("MiniCPMForCausalLM")
 class MiniCPMModel(Model):
    model_arch = gguf.MODEL_ARCH.MINICPM
@@ -1594,6 +1799,111 @@ class QwenModel(Model):
 class Qwen2Model(Model):
    model_arch = gguf.MODEL_ARCH.QWEN2

+    def set_vocab(self):
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+
+@Model.register("Qwen2MoeForCausalLM")
+class Qwen2MoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.QWEN2MOE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if (n_experts := self.hparams.get("num_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_experts)
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_experts = self.hparams.get("num_experts")
+        experts = dict()
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # process the experts separately
+            if name.find("experts") != -1:
+                experts[name] = data
+                if len(experts) >= n_experts * 3:
+                    # merge the experts into a single 3d tensor
+                    for bid in range(block_count):
+                        for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                            full = True
+                            for xid in range(n_experts):
+                                ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                                if ename not in experts:
+                                    full = False
+                                    break
+                            if not full:
+                                continue
+
+                            datas = []
+                            for xid in range(n_experts):
+                                ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                                datas.append(experts[ename])
+                                del experts[ename]
+
+                            data = np.stack(datas, axis=0)
+                            data_dtype = data.dtype
+
+                            if self.ftype == 0 and data_dtype == np.float16:
+                                data = data.astype(np.float32)
+
+                            if self.ftype == 1 and data_dtype == np.float32:
+                                data = data.astype(np.float16)
+
+                            merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                            new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
+                            if new_name is None:
+                                print(f"Can not map tensor {name!r}")
+                                sys.exit()
+
+                            print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
+
+                            self.gguf_writer.add_tensor(new_name, data)
+                continue
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+        if len(experts) > 0:
+            raise ValueError(f"Unprocessed experts: {experts.keys()}")
+

@Model.register("GPT2LMHeadModel")
 class GPT2Model(Model):
@@ -1685,6 +1995,91 @@ class Phi2Model(Model):
        self.gguf_writer.add_add_bos_token(False)


+@Model.register("Phi3ForCausalLM")
+class Phi3MiniModel(Model):
+    model_arch = gguf.MODEL_ARCH.PHI3
+
+    def set_vocab(self):
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        if not tokenizer_path.is_file():
+            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
+            sys.exit(1)
+
+        tokenizer = SentencePieceProcessor(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+
+            piece = tokenizer.id_to_piece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.get_score(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
+
+        rot_pct = 1.0
+        n_embd = self.find_hparam(["hidden_size", "n_embd"])
+        n_head = self.find_hparam(["num_attention_heads", "n_head"])
+        rms_eps = self.find_hparam(["rms_norm_eps"])
+
+        self.gguf_writer.add_name("Phi3")
+        self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
+
+        self.gguf_writer.add_embedding_length(n_embd)
+        self.gguf_writer.add_feed_forward_length(8192)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head)
+        self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
+        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+        self.gguf_writer.add_file_type(self.ftype)
+
+
@Model.register("PlamoForCausalLM")
 class PlamoModel(Model):
    model_arch = gguf.MODEL_ARCH.PLAMO
@@ -1908,6 +2303,8 @@ class InternLM2Model(Model):
        old_eos = special_vocab.special_token_ids["eos"]
        if "chat" in os.path.basename(self.dir_model.absolute()):
            # For the chat model, we replace the eos with '<|im_end|>'.
+            # TODO: this is a hack, should be fixed
+            #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
            special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
            print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
 in chat mode so that the conversation can end normally.")
@@ -2144,6 +2541,16 @@ class GemmaModel(Model):
    def set_vocab(self):
        self._set_vocab_sentencepiece()

+        # TODO: these special tokens should be exported only for the CodeGemma family
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
+                                          special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
+        special_vocab._set_special_token("prefix", 67)
+        special_vocab._set_special_token("suffix", 69)
+        special_vocab._set_special_token("middle", 68)
+        special_vocab._set_special_token("fsep",   70)
+        special_vocab._set_special_token("eot",    107)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]
@@ -2165,6 +2572,12 @@ class GemmaModel(Model):
        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)

        for name, data_torch in self.get_tensors():
+            # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
+            # To prevent errors, skip loading lm_head.weight.
+            if name == "lm_head.weight":
+                print(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
+                continue
+
            old_dtype = data_torch.dtype

            # convert any unsupported data types to float32
@@ -2224,28 +2637,34 @@ class MambaModel(Model):

            field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
            self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
+
            field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
            self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
+
            field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
            self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
+
            field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
            self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
+
            field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
            self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
+
            field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
            self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
+
            field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
            self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])

    def set_gguf_parameters(self):
-        d_model = self.find_hparam(["hidden_size", "d_model"])
-        d_conv  = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
+        d_model = self.find_hparam(["hidden_size",       "d_model"])
+        d_conv  = self.find_hparam(["conv_kernel",       "d_conv"],  optional=True) or 4
        d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
-        d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
+        d_state = self.find_hparam(["state_size",        "d_state"], optional=True) or 16
        # ceiling division
        # ref: https://stackoverflow.com/a/17511341/22827863
        # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
-        dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
+        dt_rank      = self.find_hparam(["time_step_rank",     "dt_rank"],      optional=True) or -(d_model // -16)
        rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

        # Fail early for models which don't have a block expansion factor of 2
@@ -2337,6 +2756,66 @@ class CommandR2Model(Model):
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)


+@Model.register("OlmoForCausalLM")
+@Model.register("OLMoForCausalLM")
+class OlmoModel(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_layer_norm_eps(1e-5)
+        if "clip_qkv" in self.hparams is not None:
+            self.gguf_writer.add_clamp_kqv(self.hparams["clip_qkv"])
+
+    # Same as super class, but permuting q_proj, k_proj
+    # Copied from: LlamaModel
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        n_head = self.hparams.get("num_attention_heads")
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        for name, data_torch in self.get_tensors():
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.numpy()
+
+            if name.endswith("q_proj.weight"):
+                data = permute(data, n_head, n_head)
+            if name.endswith("k_proj.weight"):
+                data = permute(data, n_head, n_kv_head)
+
+            data = data.squeeze()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # 1d tensors need to be converted to float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+
 ###### CONVERSION LOGIC ######


@@ -2363,6 +2842,7 @@ def parse_args() -> argparse.Namespace:
        "model", type=Path,
        help="directory containing model file",
    )
+    parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")

    return parser.parse_args()

@@ -2406,7 +2886,7 @@ def main() -> None:

    with torch.inference_mode():
        model_class = Model.from_model_architecture(hparams["architectures"][0])
-        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
+        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)

        print("Set model parameters")
        model_instance.set_gguf_parameters()
--- a/convert.py
+++ b/convert.py
@@ -525,7 +525,14 @@ class LlamaHfVocab(Vocab):

        # pre-check so we know if we need transformers
        tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        if (
+        is_llama3 = (
+            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+            and not tokenizer_model.get('byte_fallback', True)
+        )
+        if is_llama3:
+            raise TypeError('Llama 3 must be converted with BpeVocab')
+
+        if not is_llama3 and (
            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
            or tokenizer_json['decoder']['type'] != 'Sequence'
        ):
--- a/docs/HOWTO-add-model.md
+++ b/docs/HOWTO-add-model.md
@@ -100,6 +100,8 @@ Have a look to existing implementation like `build_llama`, `build_dbrx` or `buil

 When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR.

+Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
+
 ## GGUF specification

 https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -19,6 +19,7 @@ else()
    add_subdirectory(benchmark)
    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(embedding)
+    add_subdirectory(eval-callback)
    add_subdirectory(finetune)
    add_subdirectory(gritlm)
    add_subdirectory(gguf-split)
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -153,7 +153,7 @@ while n_cur <= n_len {
        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

        // is it an end of stream? -> mark the stream as finished
-        if new_token_id == llama_token_eos(model) || n_cur == n_len {
+        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
            i_batch[i] = -1
            // print("")
            if n_parallel > 1 {
@@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {

 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
    var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
    if nTokens < 0 {
        let actualTokensCount = -Int(nTokens)
        result = .init(repeating: 0, count: actualTokensCount)
@@ -237,7 +237,8 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
            model,
            token,
            &result,
-            Int32(result.count)
+            Int32(result.count),
+            false
        )
        assert(check == actualTokensCount)
    } else {
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -191,8 +191,8 @@ int main(int argc, char ** argv) {

            //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

-            // is it an end of stream? -> mark the stream as finished
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+            // is it an end of generation? -> mark the stream as finished
+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
                i_batch[i] = -1;
                LOG_TEE("\n");
                if (n_parallel > 1) {
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@@ -47,7 +47,7 @@ struct beam_search_callback_data {
 // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
 // For example, eob can be flagged due to maximum token length, stop words, etc.
 static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
-    return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
+    return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
 }

 // Function matching type llama_beam_search_callback_fn_t.
--- a/examples/eval-callback/CMakeLists.txt
+++ b/examples/eval-callback/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(TARGET eval-callback)
+add_executable(${TARGET} eval-callback.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TEST_TARGET test-eval-callback)
+add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
--- a/examples/eval-callback/README.md
+++ b/examples/eval-callback/README.md
@@ -0,0 +1,95 @@
+# llama.cpp/examples/eval-callback
+
+A simple example which demonstrates how to use callback during the inference.
+It simply prints to the console all operations and tensor data.
+
+Usage:
+
+```shell
+eval-callback \
+  --hf-repo ggml-org/models \
+  --hf-file phi-2/ggml-model-q4_0.gguf \
+  --model phi-2-q4_0.gguf \
+  --prompt hello \
+  --seed 42 \
+  -ngl 33
+```
+
+Will print:
+
+```shell
+llm_load_tensors: offloaded 33/33 layers to GPU
+...
+llama_new_context_with_model: n_ctx      = 512
+...
+llama_new_context_with_model:      CUDA0 compute buffer size =   105.00 MiB
+llama_new_context_with_model:  CUDA_Host compute buffer size =     6.01 MiB
+llama_new_context_with_model: graph nodes  = 1225
+llama_new_context_with_model: graph splits = 2
+ggml_debug:                 inp_embd = (f32)   GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -0.0181,   0.0272,   0.0272, ...],
+                                      ],
+                                     ]
+ggml_debug:                   norm-0 = (f32)       NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -0.6989,   1.0636,   1.0636, ...],
+                                      ],
+                                     ]
+ggml_debug:                 norm_w-0 = (f32)        MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -0.1800,   0.2817,   0.2632, ...],
+                                      ],
+                                     ]
+ggml_debug:              attn_norm-0 = (f32)        ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -0.1863,   0.2970,   0.2604, ...],
+                                      ],
+                                     ]
+ggml_debug:                   wqkv-0 = (f32)    MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1238,   1.2876,  -1.8086, ...],
+                                      ],
+                                     ]
+ggml_debug:                   bqkv-0 = (f32)        ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                      ],
+                                     ]
+ggml_debug:            bqkv-0 (view) = (f32)       VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                      ],
+                                     ]
+ggml_debug:                   Qcur-0 = (f32)       CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                      ],
+                                     ]
+ggml_debug:        Qcur-0 (reshaped) = (f32)    RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                       [ -0.3608,   0.5076,  -1.8866, ...],
+                                       [  1.7643,   0.0273,  -2.1065, ...],
+                                       ...
+                                      ],
+                                     ]
+ggml_debug:                   Qcur-0 = (f32)       ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
+                                     [
+                                      [
+                                       [ -1.1135,   1.4604,  -1.9226, ...],
+                                       [ -0.3608,   0.5076,  -1.8866, ...],
+                                       [  1.7643,   0.0273,  -2.1065, ...],
+                                       ...
+                                      ],
+                                     ]
+```
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -0,0 +1,195 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <cstdio>
+#include <random>
+#include <string>
+#include <tuple>
+#include <vector>
+
+/**
+ * This the arbitrary data which will be passed to each callback.
+ * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
+ */
+struct callback_data {
+    std::vector<uint8_t> data;
+};
+
+static std::string ggml_ne_string(const ggml_tensor * t) {
+    std::string str;
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        str += std::to_string(t->ne[i]);
+        if (i + 1 < GGML_MAX_DIMS) {
+            str += ", ";
+        }
+    }
+    return str;
+}
+
+static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
+    GGML_ASSERT(n > 0);
+    float sum = 0;
+    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
+        printf("                                     [\n");
+        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
+            if (i2 == n && ne[2] > 2*n) {
+                printf("                                      ..., \n");
+                i2 = ne[2] - n;
+            }
+            printf("                                      [\n");
+            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
+                if (i1 == n && ne[1] > 2*n) {
+                    printf("                                       ..., \n");
+                    i1 = ne[1] - n;
+                }
+                printf("                                       [");
+                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
+                    if (i0 == n && ne[0] > 2*n) {
+                        printf("..., ");
+                        i0 = ne[0] - n;
+                    }
+                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
+                    float v;
+                    if (type == GGML_TYPE_F16) {
+                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
+                    } else if (type == GGML_TYPE_F32) {
+                        v = *(float *) data + i;
+                    } else if (type == GGML_TYPE_I32) {
+                        v = (float) *(int32_t *) data + i;
+                    } else if (type == GGML_TYPE_I16) {
+                        v = (float) *(int16_t *) data + i;
+                    } else if (type == GGML_TYPE_I8) {
+                        v = (float) *(int8_t *) data + i;
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                    printf("%12.4f", v);
+                    sum += v;
+                    if (i0 < ne[0] - 1) printf(", ");
+                }
+                printf("],\n");
+            }
+            printf("                                      ],\n");
+        }
+        printf("                                     ]\n");
+        printf("                                     sum = %f\n", sum);
+    }
+}
+
+/**
+ * GGML operations callback during the graph execution.
+ *
+ * @param t current tensor
+ * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
+ *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
+ *            see ggml_backend_sched_eval_callback
+ * @param user_data user data to pass at each call back
+ * @return true to receive data or continue the graph, false otherwise
+ */
+static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
+    auto * cb_data = (callback_data *) user_data;
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    if (ask) {
+        return true; // Always retrieve data
+    }
+
+    char src1_str[128] = {0};
+    if (src1) {
+        sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
+    }
+
+    printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+           t->name, ggml_type_name(t->type), ggml_op_desc(t),
+           src0->name, ggml_ne_string(src0).c_str(),
+           src1 ? src1_str : "",
+           ggml_ne_string(t).c_str());
+
+
+    // copy the data from the GPU memory if needed
+    const bool is_host = ggml_backend_buffer_is_host(t->buffer);
+
+    if (!is_host) {
+        auto n_bytes = ggml_nbytes(t);
+        cb_data->data.resize(n_bytes);
+        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
+    }
+
+    if (!ggml_is_quantized(t->type)) {
+        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
+        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
+    }
+
+    return true;
+}
+
+static bool run(llama_context * ctx, const gpt_params & params) {
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+        fprintf(stderr, "%s : failed to eval\n", __func__);
+        return false;
+    }
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+
+    callback_data cb_data;
+
+    gpt_params params;
+    if (!gpt_params_parse(argc, argv, params)) {
+        return 1;
+    }
+
+    print_build_info();
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // pass the callback to the backend scheduler
+    // it will be executed for each node during the graph computation
+    params.cb_eval = ggml_debug;
+    params.cb_eval_user_data = &cb_data;
+    params.warmup = false;
+
+    // init
+    llama_model * model;
+    llama_context * ctx;
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == nullptr || ctx == nullptr) {
+        fprintf(stderr, "%s : failed to init\n", __func__);
+        return 1;
+    }
+
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
+    }
+
+    bool OK = run(ctx, params);
+    if (!OK) {
+        return 1;
+    }
+
+    llama_print_timings(ctx);
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    return 0;
+}
--- a/examples/gbnf-validator/gbnf-validator.cpp
+++ b/examples/gbnf-validator/gbnf-validator.cpp
@@ -17,7 +17,7 @@ static bool llama_sample_grammar_string(struct llama_grammar * grammar, const st
    size_t pos = 0;
    for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
        auto prev_stacks = grammar->stacks;
-        grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
+        llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks);
        if (grammar->stacks.empty()) {
            error_pos = pos;
            error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'";
--- a/examples/gguf-split/README.md
+++ b/examples/gguf-split/README.md
@@ -5,5 +5,6 @@ CLI to split / merge GGUF files.
 **Command line options:**

 - `--split`: split GGUF to multiple GGUF, default operation.
+- `--split-max-size`: max size per split in `M` or `G`, f.ex. `500M` or `2G`.
 - `--split-max-tensors`: maximum tensors in each split: default(128)
 - `--merge`: merge multiple GGUF to a single GGUF.
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -59,10 +59,10 @@ static size_t split_str_to_n_bytes(std::string str) {
    int n;
    if (str.back() == 'M') {
        sscanf(str.c_str(), "%d", &n);
-        n_bytes = n * 1024 * 1024; // megabytes
+        n_bytes = (size_t)n * 1024 * 1024; // megabytes
    } else if (str.back() == 'G') {
        sscanf(str.c_str(), "%d", &n);
-        n_bytes = n * 1024 * 1024 * 1024; // gigabytes
+        n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
    } else {
        throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
    }
--- a/examples/gguf-split/tests.sh
+++ b/examples/gguf-split/tests.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+set -eu
+
+if [ $# -lt 1 ]
+then
+    echo "usage:   $0 path_to_build_binary [path_to_temp_folder]"
+    echo "example: $0 ../../build/bin ../../tmp"
+    exit 1
+fi
+
+if [ $# -gt 1 ]
+then
+    TMP_DIR=$2
+else
+    TMP_DIR=/tmp
+fi
+
+set -x
+
+SPLIT=$1/gguf-split
+MAIN=$1/main
+WORK_PATH=$TMP_DIR/gguf-split
+ROOT_DIR=$(realpath $(dirname $0)/../../)
+
+mkdir -p "$WORK_PATH"
+
+# Clean up in case of previously failed test
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
+
+# 1. Get a model
+(
+cd $WORK_PATH
+"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+)
+echo PASS
+
+# 2. Split with max tensors strategy
+$SPLIT --split-max-tensors 28  $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
+echo PASS
+echo
+
+# 2b. Test the sharded model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 3. Merge
+$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-merge.gguf
+echo PASS
+echo
+
+# 3b. Test the merged model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 4. Split with no tensor in metadata
+#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
+#echo PASS
+#echo
+
+# 4b. Test the sharded model is loading properly
+#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32
+#echo PASS
+#echo
+
+# 5. Merge
+#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf
+#echo PASS
+#echo
+
+# 5b. Test the merged model is loading properly
+#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
+#echo PASS
+#echo
+
+# 6. Split with size strategy
+$SPLIT --split-max-size 2G $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-2G
+echo PASS
+echo
+
+# 6b. Test the sharded model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# Clean up
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf
--- a/examples/gritlm/README.md
+++ b/examples/gritlm/README.md
@@ -21,12 +21,12 @@ not have to be performed at all.
 ### Running the example
 Download a Grit model:
 ```console
-$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
+$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models
 ```

 Run the example using the downloaded model:
 ```console
-$ ./gritlm -m gritlm-7b_q4_1.gguf
+$ ./gritlm -m models/gritlm-7b_q4_1.gguf

 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -23,6 +23,7 @@ struct Stats {
 };

 struct StatParams {
+    std::string dataset;
    std::string ofile = "imatrix.dat";
    int         n_output_frequency = 10;
    int         verbosity = 1;
@@ -44,9 +45,9 @@ private:
    std::mutex                             m_mutex;
    int                                    m_last_call = 0;
    std::vector<float>                     m_src1_data;
-    std::vector<int>                       m_ids; // the expert ids from ggml_mul_mat_id
+    std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
                                                  //
-    void save_imatrix(const char * file_name) const;
+    void save_imatrix(const char * file_name, const char * dataset) const;
    void keep_imatrix(int ncall) const;
 };

@@ -81,6 +82,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
    if (ask) {
        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
        if (t->op != GGML_OP_MUL_MAT) return false;
+        // why are small batches ignored (<16 tokens)?
        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
        if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
        return true;
@@ -101,16 +103,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
    // this has been adapted to the new format of storing merged experts in a single 3d tensor
    // ref: https://github.com/ggerganov/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
-        const int idx  = ((int32_t *) t->op_params)[0];
+        //   ids  -> [n_experts_used, n_tokens]
+        //   src1 -> [cols, n_expert_used, n_tokens]
        const ggml_tensor * ids = t->src[2];
        const int n_as = src0->ne[2];
+        const int n_ids = ids->ne[0];

        // the top-k selected expert ids are stored in the ids tensor
        // for simplicity, always copy ids to host, because it is small
        // take into account that ids is not contiguous!
-        GGML_ASSERT(ids->ne[1] == src1->ne[1]);
-        GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int)));
-        m_ids.resize(ggml_nbytes(ids)/sizeof(int));
+
+        GGML_ASSERT(ids->ne[1] == src1->ne[2]);
+
+        m_ids.resize(ggml_nbytes(ids));
        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));

        auto & e = m_stats[wname];
@@ -120,26 +125,35 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        //       using the following line, we can correct for that if needed by replacing the line above with:
        //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;

+        if (e.values.empty()) {
+            e.values.resize(src1->ne[0]*n_as, 0);
+        }
+        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
+            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+            exit(1); //GGML_ASSERT(false);
+        }
+        if (m_params.verbosity > 1) {
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
+        }
        // loop over all possible experts, regardless if they are used or not in the batch
        for (int ex = 0; ex < n_as; ++ex) {
            size_t e_start = ex*src1->ne[0];
-            if (e.values.empty()) {
-                e.values.resize(src1->ne[0]*n_as, 0);
-            }
-            else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
-                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
-                exit(1); //GGML_ASSERT(false);
-            }
-            if (m_params.verbosity > 1) {
-                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
-            }
-            for (int row = 0; row < (int)src1->ne[1]; ++row) {
-                const int excur = m_ids[row*n_as + idx];
-                GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
-                if (excur != ex) continue;
-                const float * x = data + row * src1->ne[0];
-                for (int j = 0; j < (int)src1->ne[0]; ++j) {
-                    e.values[e_start + j] += x[j]*x[j];
+
+            for (int idx = 0; idx < n_ids; ++idx) {
+                for (int row = 0; row < (int)src1->ne[2]; ++row) {
+                    const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);
+
+                    GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
+
+                    if (excur != ex) continue;
+
+                    const int64_t i11 = idx % src1->ne[1];
+                    const int64_t i12 = row;
+                    const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]);
+
+                    for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                        e.values[e_start + j] += x[j]*x[j];
+                    }
                }
            }
            if (e.ncall > m_last_call) {
@@ -186,7 +200,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 }

 void IMatrixCollector::save_imatrix() const {
-    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
+    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
 }

 void IMatrixCollector::keep_imatrix(int ncall) const {
@@ -194,24 +208,33 @@ void IMatrixCollector::keep_imatrix(int ncall) const {
    if (file_name.empty()) file_name = "imatrix.dat";
    file_name += ".at_";
    file_name += std::to_string(ncall);
-    save_imatrix(file_name.c_str());
+    save_imatrix(file_name.c_str(), m_params.dataset.c_str());
 }

-void IMatrixCollector::save_imatrix(const char * fname) const {
+void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
    std::ofstream out(fname, std::ios::binary);
    int n_entries = m_stats.size();
-    out.write((const char*)&n_entries, sizeof(n_entries));
-    for (auto& p : m_stats) {
+    out.write((const char *) &n_entries, sizeof(n_entries));
+    for (const auto & p : m_stats) {
        int len = p.first.size();
-        out.write((const char*)&len, sizeof(len));
+        out.write((const char *) &len, sizeof(len));
        out.write(p.first.c_str(), len);
-        out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
+        out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
        int nval = p.second.values.size();
-        out.write((const char*)&nval, sizeof(nval));
-        if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
+        out.write((const char *) &nval, sizeof(nval));
+        if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
    }
+
+    // Write the number of call the matrix was computed with
+    out.write((const char *) &m_last_call, sizeof(m_last_call));
+
+    // Write the dataset name at the end of the file to later on specify it in quantize
+    int n_dataset = strlen(dataset);
+    out.write((const char *) &n_dataset, sizeof(n_dataset));
+    out.write(dataset, n_dataset);
+
    if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
    }
 }

@@ -534,6 +557,29 @@ int main(int argc, char ** argv) {
        }
    }

+    gpt_params params;
+    params.n_batch = 512;
+    if (!gpt_params_parse(args.size(), args.data(), params)) {
+        return 1;
+    }
+
+    params.logits_all = true;
+    params.n_batch = std::min(params.n_batch, params.n_ctx);
+
+    print_build_info();
+
+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    sparams.dataset = params.prompt_file;
    g_collector.set_parameters(std::move(sparams));

    if (!combine_files.empty()) {
@@ -572,49 +618,21 @@ int main(int argc, char ** argv) {
        }
    }

-    gpt_params params;
-    params.n_batch = 512;
-    if (!gpt_params_parse(args.size(), args.data(), params)) {
-        return 1;
-    }
-
-    params.logits_all = true;
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
-
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
    llama_backend_init();
    llama_numa_init(params.numa);

-    llama_model_params mparams = llama_model_params_from_gpt_params(params);
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
-        return 1;
-    }
-
-    llama_context_params cparams = llama_context_params_from_gpt_params(params);
-
    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
-    cparams.cb_eval = ik_collect_imatrix;
-    cparams.cb_eval_user_data = NULL;
+    params.cb_eval = ik_collect_imatrix;
+    params.cb_eval_user_data = NULL;
+    params.warmup = false;

-    llama_context * ctx = llama_new_context_with_model(model, cparams);
-    if (ctx == NULL) {
-        fprintf(stderr, "%s: error: unable to create context\n", __func__);
+    // init
+    llama_model * model;
+    llama_context * ctx;
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == nullptr || ctx == nullptr) {
+        fprintf(stderr, "%s : failed to init\n", __func__);
        return 1;
    }

--- a/examples/infill/README.md
+++ b/examples/infill/README.md
@@ -36,6 +36,11 @@ The `infill` program offers a seamless way to interact with LLaMA models, allowi

 ### Example

+Download a model that supports infill, for example CodeLlama:
+```console
+scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models
+```
+
 ```bash
 ./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n    print(\"hell" --in-suffix "\n   print(\"goodbye world\")\n    "
 ```
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -586,7 +586,7 @@ int main(int argc, char ** argv) {

            // deal with eot token in infill mode
            if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
-                if(is_interacting && !params.interactive_first) {
+                if (is_interacting && !params.interactive_first) {
                    // print an eot token
                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
                }
@@ -651,8 +651,8 @@ int main(int argc, char ** argv) {
                // LOG_TEE("took new input\n");
                is_interacting = false;
            }
-            // deal with end of text token in interactive mode
-            else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+            // deal with end of generation tokens in interactive mode
+            else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
                LOG("found EOS token\n");

                if (params.interactive) {
@@ -731,8 +731,8 @@ int main(int argc, char ** argv) {
            }
        }

-        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
+        // end of generation
+        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
            break;
        }

--- a/examples/json_schema_to_grammar.py
+++ b/examples/json_schema_to_grammar.py
@@ -6,37 +6,94 @@ import re
 import sys
 from typing import Any, Dict, List, Set, Tuple, Union

+def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
+    if not separator_rule:
+        if min_items == 0 and max_items == 1:
+            return f'{item_rule}?'
+        elif min_items == 1 and max_items is None:
+            return f'{item_rule}+'
+
+    result = ''
+
+    if min_items > 0:
+        if item_rule_is_literal and separator_rule is None:
+            result = '"' + (item_rule[1:-1] * min_items) + '"'
+        else:
+            result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
+
+    def opt_repetitions(up_to_n, prefix_with_sep=False):
+        '''
+            - n=4, no sep:             '(a (a (a (a)?)?)?)?'
+            - n=4, sep=',', prefix:    '("," a ("," a ("," a ("," a)?)?)?)?'
+            - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
+        '''
+
+        content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
+        if up_to_n == 0:
+            return ''
+        elif up_to_n == 1:
+            return f'({content})?'
+        elif separator_rule and not prefix_with_sep:
+            return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
+        else:
+            return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
+
+    if min_items > 0 and max_items != min_items:
+        result += ' '
+
+    if max_items is not None:
+        result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
+    else:
+        item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
+
+        if min_items == 0 and separator_rule:
+            result = f'({item_rule} {item_operator}*)?'
+        else:
+            result += f'{item_operator}*'
+
+    return result
+
+
+class BuiltinRule:
+    def __init__(self, content: str, deps: list = None):
+        self.content = content
+        self.deps = deps or []
+
+_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
+
 # whitespace is constrained to a single space char to prevent model "running away" in
 # whitespace. Also maybe improves generation quality?
 SPACE_RULE = '" "?'

 PRIMITIVE_RULES = {
-    'boolean': '("true" | "false") space',
-    'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
-    'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
-    'value'  : 'object | array | string | number | boolean',
-    'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
-    'array'  : '"[" space ( value ("," space value)* )? "]" space',
-    'uuid'   : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space',
-    'string': r''' "\"" (
-        [^"\\] |
-        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
-      )* "\"" space''',
-    'null': '"null" space',
+    'boolean'      : BuiltinRule('("true" | "false") space', []),
+    'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
+    'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
+    'number'       : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
+    'integer'      : BuiltinRule('("-"? integral-part) space', ['integral-part']),
+    'value'        : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
+    'object'       : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
+    'array'        : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
+    'uuid'         : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
+    'char'         : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
+    'string'       : BuiltinRule(r'"\"" char* "\"" space', ['char']),
+    'null'         : BuiltinRule('"null" space', []),
 }
-OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value']

 # TODO: support "uri", "email" string formats
-DATE_RULES = {
-    'date'   : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
-    'time'   : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
-    'date-time': 'date "T" time',
-    'date-string': '"\\"" date "\\"" space',
-    'time-string': '"\\"" time "\\"" space',
-    'date-time-string': '"\\"" date-time "\\"" space',
+STRING_FORMAT_RULES = {
+    'date'            : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
+    'time'            : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
+    'date-time'       : BuiltinRule('date "T" time', ['date', 'time']),
+    'date-string'     : BuiltinRule('"\\"" date "\\"" space', ['date']),
+    'time-string'     : BuiltinRule('"\\"" time "\\"" space', ['time']),
+    'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
 }

-RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()])
+DOTALL = '[\\U00000000-\\U0010FFFF]'
+DOT = '[^\\x0A\\x0D]'
+
+RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()])

 INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
 GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
@@ -46,8 +103,6 @@ GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']'
 NON_LITERAL_SET = set('|.()[]{}*+?')
 ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')

-DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
-TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits

 class SchemaConverter:
    def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
@@ -55,7 +110,9 @@ class SchemaConverter:
        self._allow_fetch = allow_fetch
        self._dotall = dotall
        self._raw_pattern = raw_pattern
-        self._rules = {'space': SPACE_RULE}
+        self._rules = {
+            'space': SPACE_RULE,
+        }
        self._refs = {}
        self._refs_being_resolved = set()

@@ -65,6 +122,29 @@ class SchemaConverter:
        )
        return f'"{escaped}"'

+    def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str:
+        '''
+            not_literal('a') -> '[^a]'
+            not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?'
+        '''
+        assert len(literal) > 0, 'Empty literal not supported'
+        def recurse(i: int):
+            c = literal[i]
+            if maybe_escaped_underscores and c == '_':
+                yield f'[^{c}\\\\]'
+                yield ' | '
+                yield f'"\\\\"? "{c}"'
+            else:
+                yield f'[^{c}]'
+            if i < len(literal) - 1:
+                yield ' | '
+                yield self._format_literal(c)
+                yield ' ('
+                yield from recurse(i + 1)
+                yield ')?'
+
+        return ''.join(('(', *recurse(0), ')'))
+
    def _add_rule(self, name, rule):
        esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
        if esc_name not in self._rules or self._rules[esc_name] == rule:
@@ -169,10 +249,10 @@ class SchemaConverter:

            def get_dot():
                if self._dotall:
-                    rule = '[\\U00000000-\\U0010FFFF]'
+                    rule = DOTALL
                else:
                    # Accept any character... except \n and \r line break chars (\x0A and \xOD)
-                    rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]'
+                    rule = DOT
                return self._add_rule(f'dot', rule)

            def join_seq():
@@ -246,26 +326,14 @@ class SchemaConverter:

                    (sub, sub_is_literal) = seq[-1]

-                    if min_times == 0 and max_times is None:
-                        seq[-1] = (f'{sub}*', False)
-                    elif min_times == 0 and max_times == 1:
-                        seq[-1] = (f'{sub}?', False)
-                    elif min_times == 1 and max_times is None:
-                        seq[-1] = (f'{sub}+', False)
-                    else:
-                        if not sub_is_literal:
-                            id = sub_rule_ids.get(sub)
-                            if id is None:
-                                id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
-                                sub_rule_ids[sub] = id
-                            sub = id
+                    if not sub_is_literal:
+                        id = sub_rule_ids.get(sub)
+                        if id is None:
+                            id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
+                            sub_rule_ids[sub] = id
+                        sub = id

-                        seq[-1] = (
-                            ' '.join(
-                                ([f'"{sub[1:-1] * min_times}"'] if sub_is_literal else [sub] * min_times) +
-                                ([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])),
-                            False
-                        )
+                    seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
                else:
                    literal = ''
                    while i < length:
@@ -373,49 +441,47 @@ class SchemaConverter:
                    ' "]" space')
            else:
                item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
-                list_item_operator = f'( "," space {item_rule_name} )'
-                successive_items = ""
                min_items = schema.get("minItems", 0)
                max_items = schema.get("maxItems")
-                if min_items > 0:
-                    successive_items = list_item_operator * (min_items - 1)
-                    min_items -= 1
-                if max_items is not None and max_items > min_items:
-                    successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
-                else:
-                    successive_items += list_item_operator + "*"
-                if min_items == 0:
-                    rule = f'"[" space ( {item_rule_name} {successive_items} )? "]" space'
-                else:
-                    rule = f'"[" space {item_rule_name} {successive_items} "]" space'
-                return self._add_rule(rule_name, rule)
+                return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space')

        elif schema_type in (None, 'string') and 'pattern' in schema:
            return self._visit_pattern(schema['pattern'], rule_name)

        elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
-            return self._add_rule(
+            return self._add_primitive(
                'root' if rule_name == 'root' else schema_format,
                PRIMITIVE_RULES['uuid']
            )

-        elif schema_type in (None, 'string') and schema_format in DATE_RULES:
-            for t, r in DATE_RULES.items():
-                self._add_rule(t, r)
-            return schema_format + '-string'
+        elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES:
+            prim_name = f'{schema_format}-string'
+            return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name]))
+
+        elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema):
+            char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
+            min_len = schema.get('minLength', 0)
+            max_len = schema.get('maxLength')
+
+            return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')

        elif (schema_type == 'object') or (len(schema) == 0):
-            for n in OBJECT_RULE_NAMES:
-                self._add_rule(n, PRIMITIVE_RULES[n])
-            return self._add_rule(rule_name, 'object')
+            return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))

        else:
            assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
            # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
-            return self._add_rule(
-                'root' if rule_name == 'root' else schema_type,
-                PRIMITIVE_RULES[schema_type]
-            )
+            return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type])
+
+    def _add_primitive(self, name: str, rule: BuiltinRule):
+        n = self._add_rule(name, rule.content)
+
+        for dep in rule.deps:
+            dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep)
+            assert dep_rule, f'Rule {dep} not known'
+            if dep not in self._rules:
+                self._add_primitive(dep, dep_rule)
+        return n

    def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
        prop_order = self._prop_order
@@ -437,7 +503,7 @@ class SchemaConverter:
            value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
            prop_kv_rule_names["*"] = self._add_rule(
                f'{sub_name}-kv',
-                self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
+                self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
            )
            optional_props.append("*")

--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -190,7 +190,7 @@ static const cmd_params cmd_params_defaults = {
    /* n_ubatch      */ {512},
    /* type_k        */ {GGML_TYPE_F16},
    /* type_v        */ {GGML_TYPE_F16},
-    /* n_threads     */ {get_num_physical_cores()},
+    /* n_threads     */ {get_math_cpu_count()},
    /* n_gpu_layers  */ {99},
    /* split_mode    */ {LLAMA_SPLIT_MODE_LAYER},
    /* main_gpu      */ {0},
--- a/examples/llama.android/app/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp
@@ -408,7 +408,7 @@ Java_com_example_llama_Llm_completion_1loop(
    const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);

    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
-    if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+    if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
        return env->NewStringUTF("");
    }

--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -158,7 +158,7 @@ actor LlamaContext {
            new_token_id = llama_sample_token_greedy(context, &candidates_p)
        }

-        if new_token_id == llama_token_eos(model) || n_cur == n_len {
+        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
            print("\n")
            let new_token_str = String(cString: temporary_invalid_cchars + [0])
            temporary_invalid_cchars.removeAll()
@@ -322,7 +322,7 @@ actor LlamaContext {
        defer {
            result.deallocate()
        }
-        let nTokens = llama_token_to_piece(model, token, result, 8)
+        let nTokens = llama_token_to_piece(model, token, result, 8, false)

        if nTokens < 0 {
            let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
@@ -330,7 +330,7 @@ actor LlamaContext {
            defer {
                newResult.deallocate()
            }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
            return Array(bufferPointer)
        } else {
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -22,7 +22,7 @@ After building, run: `./llava-cli` to see the usage. For example:

 ## Model conversion

- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
+1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:

 ```sh
 git clone https://huggingface.co/mtgv/MobileVLM-1.7B
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -24,7 +24,7 @@ After building, run: `./llava-cli` to see the usage. For example:

 ## LLaVA 1.5

- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
+1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:

 ```sh
 git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
+#include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -23,7 +24,6 @@
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
-#include <iostream>
 #include <map>
 #include <regex>
 #include <stdexcept>
@@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_POS_EMBD        "%s.position_embd.weight"
 #define TN_CLASS_EMBD      "v.class_embd"
 #define TN_PATCH_EMBD      "v.patch_embd.weight"
+#define TN_PATCH_BIAS      "v.patch_embd.bias"
 #define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
@@ -145,7 +146,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
 static int get_key_idx(const gguf_context * ctx, const char * key) {
    int i = gguf_find_key(ctx, key);
    if (i == -1) {
-        fprintf(stderr, "key %s not found in file\n", key);
+        LOG_TEE("key %s not found in file\n", key);
        throw std::runtime_error(format("Missing required key: %s", key));
    }

@@ -247,7 +248,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {

 static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
    size_t tensor_size = ggml_nbytes(tensor);
-    printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+    LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
            prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
 }
@@ -265,7 +266,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
 static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
-        std::cerr << "Failed to open file for writing: " << filename << std::endl;
+        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
        return;
    }

@@ -284,7 +285,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
 static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
-        std::cerr << "Failed to open file for writing: " << filename << std::endl;
+        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
        return;
    }

@@ -425,6 +426,7 @@ struct clip_vision_model {
    // embeddings
    struct ggml_tensor * class_embedding;
    struct ggml_tensor * patch_embeddings;
+    struct ggml_tensor * patch_bias;
    struct ggml_tensor * position_embeddings;

    struct ggml_tensor * pre_ln_w;
@@ -501,6 +503,11 @@ struct clip_ctx {
    bool use_gelu = false;
    int32_t ftype = 1;

+    bool has_class_embedding = true;
+    bool has_pre_norm = true;
+    bool has_post_norm = false;
+    bool has_patch_bias = false;
+
    struct gguf_context * ctx_gguf;
    struct ggml_context * ctx_data;

@@ -515,7 +522,7 @@ struct clip_ctx {

 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
    if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return nullptr;
    }

@@ -526,7 +533,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    const int patch_size           = hparams.patch_size;
    const int num_patches          = ((image_size / patch_size) * (image_size / patch_size));
    const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
-    const int num_positions        = num_patches + 1;
+    const int num_positions        = num_patches + (ctx->has_class_embedding ? 1 : 0);
    const int hidden_size          = hparams.hidden_size;
    const int n_head               = hparams.n_head;
    const int d_head               = hidden_size / n_head;
@@ -557,16 +564,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

+    if (ctx->has_patch_bias) {
+        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
+        inp = ggml_add(ctx0, inp, model.patch_bias);
+    }
+
    // concat class_embeddings and patch_embeddings
-    struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+    struct ggml_tensor * embeddings = inp;
+    if (ctx->has_class_embedding) {
+        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+        embeddings = ggml_acc(ctx0, embeddings, inp,
+                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+    }
    ggml_set_name(embeddings, "embeddings");
    ggml_set_input(embeddings);

-    embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-
-    embeddings = ggml_acc(ctx0, embeddings, inp,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);

    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
    ggml_set_name(positions, "positions");
@@ -576,7 +590,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

    // pre-layernorm
-    {
+    if (ctx->has_pre_norm) {
        embeddings = ggml_norm(ctx0, embeddings, eps);
        ggml_set_name(embeddings, "pre_ln");

@@ -664,6 +678,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        embeddings = cur;
    }

+    // post-layernorm
+    if (ctx->has_post_norm) {
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "post_ln");
+
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
+    }
+
    // llava projector
    {
        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -879,21 +901,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        const int idx_name = gguf_find_key(ctx, KEY_NAME);
        if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
            const std::string name = gguf_get_val_str(ctx, idx_name);
-            printf("%s: model name:   %s\n", __func__, name.c_str());
+            LOG_TEE("%s: model name:   %s\n", __func__, name.c_str());
        }
-        printf("%s: description:  %s\n", __func__, description.c_str());
-        printf("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
-        printf("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
-        printf("%s: n_tensors:    %d\n", __func__, n_tensors);
-        printf("%s: n_kv:         %d\n", __func__, n_kv);
-        printf("%s: ftype:        %s\n", __func__, ftype_str.c_str());
-        printf("\n");
+        LOG_TEE("%s: description:  %s\n", __func__, description.c_str());
+        LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
+        LOG_TEE("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
+        LOG_TEE("%s: n_tensors:    %d\n", __func__, n_tensors);
+        LOG_TEE("%s: n_kv:         %d\n", __func__, n_kv);
+        LOG_TEE("%s: ftype:        %s\n", __func__, ftype_str.c_str());
+        LOG_TEE("\n");
    }
    const int n_tensors = gguf_get_n_tensors(ctx);

    // kv
    const int n_kv = gguf_get_n_kv(ctx);
-    printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+    LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
        __func__, n_kv, n_tensors, fname);
    {
        std::map<enum ggml_type, uint32_t> n_type;
@@ -904,7 +926,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            n_type[type]++;
        }

-        printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
        for (int i = 0; i < n_kv; i++) {
            const char * name           = gguf_get_key(ctx, i);
            const enum gguf_type type   = gguf_get_kv_type(ctx, i);
@@ -920,7 +942,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            }
            replace_all(value, "\n", "\\n");

-            printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+            LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
        }

        // print type counts
@@ -929,7 +951,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                continue;
            }

-            printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+            LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
        }
    }

@@ -944,7 +966,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            size_t tensor_size = ggml_nbytes(cur);
            model_size += tensor_size;
            if (verbosity >= 3) {
-                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
                       __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
            }
        }
@@ -971,18 +993,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

 #ifdef GGML_USE_CUDA
    new_clip->backend = ggml_backend_cuda_init(0);
-    printf("%s: CLIP using CUDA backend\n", __func__);
+    LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
 #endif

 #ifdef GGML_USE_METAL
    new_clip->backend = ggml_backend_metal_init();
-    printf("%s: CLIP using Metal backend\n", __func__);
+    LOG_TEE("%s: CLIP using Metal backend\n", __func__);
 #endif


    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
-        printf("%s: CLIP using CPU backend\n", __func__);
+        LOG_TEE("%s: CLIP using CPU backend\n", __func__);
    }

    // model size and capabilities
@@ -1006,15 +1028,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        new_clip->use_gelu = gguf_get_val_bool(ctx, idx);

        if (verbosity >= 1) {
-            printf("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
-            printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
-            printf("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
-            printf("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
-            printf("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+            LOG_TEE("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
+            LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
+            LOG_TEE("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
+            LOG_TEE("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
+            LOG_TEE("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
        }
    }

-    printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
+    LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);

    // load tensors
    {
@@ -1027,7 +1049,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

        new_clip->ctx_data = ggml_init(params);
        if (!new_clip->ctx_data) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+            LOG_TEE("%s: ggml_init() failed\n", __func__);
            clip_free(new_clip);
            gguf_free(ctx);
            return nullptr;
@@ -1035,7 +1057,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

        auto fin = std::ifstream(fname, std::ios::binary);
        if (!fin) {
-            printf("cannot open model file for loading tensors\n");
+            LOG_TEE("cannot open model file for loading tensors\n");
            clip_free(new_clip);
            gguf_free(ctx);
            return nullptr;
@@ -1057,7 +1079,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
            fin.seekg(offset, std::ios::beg);
            if (!fin) {
-                printf("%s: failed to seek for tensor %s\n", __func__, name);
+                LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
                clip_free(new_clip);
                gguf_free(ctx);
                return nullptr;
@@ -1128,34 +1150,61 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }

        if (verbosity >= 2) {
-            printf("\n%s: vision model hparams\n", __func__);
-            printf("image_size         %d\n", hparams.image_size);
-            printf("patch_size         %d\n", hparams.patch_size);
-            printf("v_hidden_size      %d\n", hparams.hidden_size);
-            printf("v_n_intermediate   %d\n", hparams.n_intermediate);
-            printf("v_projection_dim   %d\n", hparams.projection_dim);
-            printf("v_n_head           %d\n", hparams.n_head);
-            printf("v_n_layer          %d\n", hparams.n_layer);
-            printf("v_eps              %f\n", hparams.eps);
-            printf("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
-            printf("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
-            printf("v_image_grid_pinpoints: ");
+            LOG_TEE("\n%s: vision model hparams\n", __func__);
+            LOG_TEE("image_size         %d\n", hparams.image_size);
+            LOG_TEE("patch_size         %d\n", hparams.patch_size);
+            LOG_TEE("v_hidden_size      %d\n", hparams.hidden_size);
+            LOG_TEE("v_n_intermediate   %d\n", hparams.n_intermediate);
+            LOG_TEE("v_projection_dim   %d\n", hparams.projection_dim);
+            LOG_TEE("v_n_head           %d\n", hparams.n_head);
+            LOG_TEE("v_n_layer          %d\n", hparams.n_layer);
+            LOG_TEE("v_eps              %f\n", hparams.eps);
+            LOG_TEE("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+            LOG_TEE("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+            LOG_TEE("v_image_grid_pinpoints: ");
            for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
-                printf("%d ", hparams.image_grid_pinpoints[i]);
+                LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
            }
-            printf("\n");
-            printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+            LOG_TEE("\n");
+            LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);

        }

+        try {
+            vision_model.class_embedding  = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
+            new_clip->has_class_embedding = true;
+        } catch (const std::exception& e) {
+            new_clip->has_class_embedding = false;
+        }
+
+        try {
+            vision_model.pre_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+            vision_model.pre_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
+            new_clip->has_pre_norm = true;
+        } catch (std::exception & e) {
+            new_clip->has_pre_norm = false;
+        }
+
+        try {
+            vision_model.post_ln_w  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
+            vision_model.post_ln_b  = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
+            new_clip->has_post_norm = true;
+        } catch (std::exception & e) {
+            new_clip->has_post_norm = false;
+        }
+
+        try {
+            vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
+            new_clip->has_patch_bias = true;
+        } catch (std::exception & e) {
+            new_clip->has_patch_bias = false;
+        }
+
        try {
            vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
-            vision_model.class_embedding     = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
-            vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-            vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
        } catch(const std::exception& e) {
-            fprintf(stderr, "%s: failed to load vision model tensors\n", __func__);
+            LOG_TEE("%s: failed to load vision model tensors\n", __func__);
        }

        // LLaVA projection
@@ -1184,7 +1233,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            } catch (std::runtime_error & e) {  }
            try {
                vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
-                // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__);
+                // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
            } catch (std::runtime_error & e) {  }
        } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
            // MobileVLM projection
@@ -1264,7 +1313,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
        ggml_gallocr_reserve(new_clip->compute_alloc, gf);
        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+        LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
    }

    return new_clip;
@@ -1304,7 +1353,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
    int nx, ny, nc;
    auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
    if (!data) {
-        fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
+        LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
        return false;
    }
    build_clip_img_from_data(data, nx, ny, img);
@@ -1316,7 +1365,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
    int nx, ny, nc;
    auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
    if (!data) {
-        fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
+        LOG_TEE("%s: failed to decode image bytes\n", __func__);
        return false;
    }
    build_clip_img_from_data(data, nx, ny, img);
@@ -1325,7 +1374,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
 }

 // Linear interpolation between two points
-inline float lerp(float s, float e, float t) {
+inline float clip_lerp(float s, float e, float t) {
    return s + (e - s) * t;
 }
 // Bilinear resize function
@@ -1347,17 +1396,17 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
            float y_lerp = py - y_floor;

            for (int c = 0; c < 3; c++) {
-                float top = lerp(
+                float top = clip_lerp(
                    static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
                    static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
                    x_lerp
                );
-                float bottom = lerp(
+                float bottom = clip_lerp(
                    static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
                    static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
                    x_lerp
                );
-                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
+                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp));
            }
        }
    }
@@ -1506,7 +1555,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
        int downscaled_height = static_cast<int>(original_height * scale);
        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        int wasted_resolution = (width * height) - effective_resolution;
-        // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution;
@@ -1545,7 +1594,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
    bool pad_to_square = true;
    if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return false;
    }
    auto & params = ctx->vision_model.hparams;
@@ -1622,7 +1671,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
            }

            for (size_t i = 0; i < patches.size(); i++) {
-                // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+                // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
                clip_image_u8_free(patches[i]);
            }

@@ -1765,7 +1814,7 @@ int clip_n_patches(const struct clip_ctx * ctx) {

 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
    if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return false;
    }

@@ -1777,7 +1826,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3

 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
    if (!ctx->has_vision_encoder) {
-        printf("This gguf file seems to have no vision encoder\n");
+        LOG_TEE("This gguf file seems to have no vision encoder\n");
        return false;
    }

@@ -1939,7 +1988,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            new_type = type;
            if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
                new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
-                // fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
+                // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
            }
            const size_t n_elms = ggml_nelements(cur);
            float * f32_data;
@@ -1958,7 +2007,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                f32_data = (float *)conv_buf.data();
                break;
            default:
-                printf("Please use an input file in f32 or f16\n");
+                LOG_TEE("Please use an input file in f32 or f16\n");
                gguf_free(ctx_out);
                return false;
            }
@@ -1985,7 +2034,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            fout.put(0);
        }

-        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
+        LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
               orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
    }

@@ -2001,8 +2050,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
    gguf_free(ctx_out);

    {
-        printf("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+        LOG_TEE("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+        LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
    }

    return true;
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "log.h"
 #include "common.h"
 #include "clip.h"
 #include "llava.h"
@@ -18,7 +19,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-            fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
        *n_past += n_eval;
@@ -45,7 +46,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
    static std::string ret;
-    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
+    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
    } else {
        ret = llama_token_to_piece(ctx_llama, id);
@@ -73,7 +74,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
    size_t img_base64_str_start, img_base64_str_end;
    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-        fprintf(stderr, "%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+        LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
        return NULL;
    }

@@ -87,7 +88,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip

    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
    if (!embed) {
-        fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
+        LOG_TEE("%s: could not load image from base64 string.\n", __func__);
        return NULL;
    }

@@ -112,8 +113,8 @@ struct llava_context {
 };

 static void show_additional_info(int /*argc*/, char ** argv) {
-    fprintf(stderr, "\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    fprintf(stderr, "  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

 static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
@@ -123,18 +124,18 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
    auto prompt = params->prompt;
    if (prompt_contains_image(prompt)) {
        if (!params->image.empty()) {
-            fprintf(stderr, "using base64 encoded image instead of command line image path\n");
+            LOG_TEE("using base64 encoded image instead of command line image path\n");
        }
        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
        if (!embed) {
-            fprintf(stderr, "%s: can't load image from prompt\n", __func__);
+            LOG_TEE("%s: can't load image from prompt\n", __func__);
            return NULL;
        }
        params->prompt = remove_image_from_prompt(prompt);
    } else {
        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
        if (!embed) {
-            fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
+            LOG_TEE("%s: is %s really an image file?\n", __func__, params->image.c_str());
            return NULL;
        }
    }
@@ -153,18 +154,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
        system_prompt = prompt.substr(0, image_pos);
        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        printf("system_prompt: %s\n", system_prompt.c_str());
+        LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
-        printf("user_prompt: %s\n", user_prompt.c_str());
+        LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    } else {
@@ -174,7 +175,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        if (params->verbose_prompt) {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-                printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    }
@@ -185,7 +186,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_

    // generate the response

-    fprintf(stderr, "\n");
+    LOG_TEE("\n");

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
    std::string response = "";
@@ -224,7 +225,7 @@ static struct llava_context * llava_init(gpt_params * params) {

    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        LOG_TEE("%s: error: unable to load model\n" , __func__);
        return NULL;
    }

@@ -234,7 +235,7 @@ static struct llava_context * llava_init(gpt_params * params) {
    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

    if (ctx_llama == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
        return NULL;
    }

@@ -257,6 +258,12 @@ static void llava_free(struct llava_context * ctx_llava) {
    llama_backend_free();
 }

+static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    LOG_TEE("%s", text);
+}
+
 int main(int argc, char ** argv) {
    ggml_time_init();

@@ -266,6 +273,14 @@ int main(int argc, char ** argv) {
        show_additional_info(argc, argv);
        return 1;
    }
+
+#ifndef LOG_DISABLE_LOGS
+    log_set_target(log_filename_generator("llava", "log"));
+    LOG_TEE("Log start\n");
+    log_dump_cmdline(argc, argv);
+    llama_log_set(llama_log_callback_logTee, nullptr);
+#endif // LOG_DISABLE_LOGS
+
    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
        gpt_print_usage(argc, argv, params);
        show_additional_info(argc, argv);
@@ -274,7 +289,7 @@ int main(int argc, char ** argv) {

    auto ctx_llava = llava_init(&params);
    if (ctx_llava == NULL) {
-        fprintf(stderr, "%s: error: failed to init llava\n", __func__);
+        LOG_TEE("%s: error: failed to init llava\n", __func__);
        return 1;
    }

--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -54,7 +54,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
        int downscaled_height = static_cast<int>(original_height * scale);
        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        int wasted_resolution = (width * height) - effective_resolution;
-        // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution;
@@ -154,13 +154,13 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
    model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
    if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
        if (newline_tmp->buffer == NULL) {
-            printf("newline_tmp tensor buffer is NULL\n");
+            LOG_TEE("newline_tmp tensor buffer is NULL\n");
        }
        ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
    } else {
        model.newline->data = newline_tmp->data;
        if (model.newline->data == NULL) {
-            printf("newline_tmp tensor data is NULL\n");
+            LOG_TEE("newline_tmp tensor data is NULL\n");
        }
    }

@@ -224,7 +224,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    img_res_v.size = 0;
    img_res_v.data = nullptr;
    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
-        fprintf(stderr, "%s: unable to preprocess image\n", __func__);
+        LOG_TEE("%s: unable to preprocess image\n", __func__);
        delete[] img_res_v.data;
        return false;
    }
@@ -239,7 +239,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
        delete[] img_res_v.data;
        if (!encoded) {
-            fprintf(stderr, "Unable to encode image\n");
+            LOG_TEE("Unable to encode image\n");

            return false;
        }
@@ -252,12 +252,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
            const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
            if (!encoded) {
-                fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                return false;
            }
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
-        printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        const int32_t * image_grid = clip_image_grid(ctx_clip);

@@ -290,12 +290,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
    }

-    printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

    const int64_t t_img_enc_end_us = ggml_time_us();
    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

-    printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
+    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);

    return true;
 }
@@ -305,7 +305,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
    auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
    if (n_image_embd != n_llama_embd) {
-        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+        LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
        return false;
    }
    return true;
@@ -314,13 +314,13 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
    if (!image_embd) {
-        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
+        LOG_TEE("Unable to allocate memory for image embeddings\n");
        return false;
    }

    int n_img_pos;
    if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
-        fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
+        LOG_TEE("%s: cannot encode image, aborting\n", __func__);
        free(image_embd);
        return false;
    }
@@ -340,7 +340,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
        }
        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
        if (llama_decode(ctx_llama, batch)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
+            LOG_TEE("%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
@@ -352,7 +352,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
    clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
        clip_image_u8_free(img);
-        fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
+        LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
        return NULL;
    }

@@ -361,7 +361,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
    bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
    if (!image_embed_result) {
        clip_image_u8_free(img);
-        fprintf(stderr, "%s: coulnd't embed the image\n", __func__);
+        LOG_TEE("%s: coulnd't embed the image\n", __func__);
        return NULL;
    }

@@ -375,7 +375,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
 static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
    auto file = fopen(path, "rb");
    if (file == NULL) {
-        fprintf(stderr, "%s: can't read file %s\n", __func__, path);
+        LOG_TEE("%s: can't read file %s\n", __func__, path);
        return false;
    }

@@ -385,7 +385,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long

    auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
    if (buffer == NULL) {
-        fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
        perror("Memory allocation error");
        fclose(file);
        return false;
@@ -410,7 +410,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
    long image_bytes_length;
    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
    if (!loaded) {
-        fprintf(stderr, "%s: failed to load %s\n", __func__, image_path);
+        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
        return NULL;
    }

--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -299,7 +299,7 @@ int main(int argc, char ** argv) {
                }
                fflush(stdout);

-                if (id == llama_token_eos(model)) {
+                if (llama_token_is_eog(model, id)) {
                    has_eos = true;
                }

--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -30,7 +30,6 @@ int main(int argc, char ** argv){

    // load the model
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    llama_set_rng_seed(ctx, params.seed);
    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

    // tokenize the prompt
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -38,7 +38,6 @@ int main(int argc, char ** argv){

    // load the model
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    llama_set_rng_seed(ctx, params.seed);
    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

    // tokenize the prompt
@@ -141,7 +140,7 @@ int main(int argc, char ** argv){
                printf("%s", token_str.c_str());
            }

-            if (id == llama_token_eos(model)) {
+            if (llama_token_is_eog(model, id)) {
                has_eos = true;
            }

--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -304,13 +304,15 @@ These options help improve the performance and memory usage of the LLaMA models.

 -   `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.

-### Grammars
+### Grammars & JSON schemas

 -   `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.

+-   `--json-schema SCHEMA`: Specify a [JSON schema](https://json-schema.org/) to constrain model output to (e.g. `{}` for any JSON object, or `{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}` for a JSON array of strings with size constraints). If a schema uses external `$ref`s, you should use `--grammar "$( python examples/json_schema_to_grammar.py myschema.json )"` instead.
+
 ### Quantization

-For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
+For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize).

 ## Additional Options

--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -240,7 +240,6 @@ int main(int argc, char ** argv) {
                return 1;
            }
            session_tokens.resize(n_token_count_out);
-            llama_set_rng_seed(ctx, params.seed);
            LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
        }
    }
@@ -795,8 +794,8 @@ int main(int argc, char ** argv) {
                }
            }

-            // deal with end of text token in interactive mode
-            if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
+            // deal with end of generation tokens in interactive mode
+            if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
                LOG("found EOS token\n");

                if (params.interactive) {
@@ -920,8 +919,8 @@ int main(int argc, char ** argv) {
            }
        }

-        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
+        // end of generation
+        if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) {
            LOG_TEE(" [end of text]\n");
            break;
        }
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -359,7 +359,7 @@ int main(int argc, char ** argv) {
                //        client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());

                if (client.n_decoded > 2 &&
-                        (id == llama_token_eos(model) ||
+                        (llama_token_is_eog(model, id) ||
                         (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
                         client.response.find("User:") != std::string::npos ||
                         client.response.find('\n') != std::string::npos)) {
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -252,8 +252,8 @@ int main(int argc, char ** argv) {
            // sample the most likely token
            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

-            // is it an end of stream?
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
                LOG_TEE("\n");

                break;
--- a/examples/perplexity/README.md
+++ b/examples/perplexity/README.md
@@ -3,19 +3,18 @@
 TODO

 ## Llama 2 70B Scorechart
-Quantization | Model size (GiB) | Perplexity | Delta to fp16
-- | -- | -- | --
-Q4_0 | 36.20 | 3.5550 | 3.61%
-Q4_1 | 40.20 | 3.5125 | 2.37%
-Q5_0 | 44.20 | 3.4744 | 1.26%
-Q2_K | 27.27 | 3.7339 | 8.82%
-Q3_K_S | 27.86 | 3.7019 | 7.89%
-Q3_K_M | 30.83 | 3.5932 | 4.72%
-Q3_K_L | 33.67 | 3.5617 | 3.80%
-Q4_K_S | 36.39 | 3.4852 | 1.57%
-Q4_K_M | 38.54 | 3.4725 | 1.20%
-Q5_K_S | 44.20 | 3.4483 | 0.50%
-Q5_K_M | 45.41 | 3.4451 | 0.40%
-Q6_K | 52.70 | 3.4367 | 0.16%
-fp16 | 128.5 | 3.4313 | -
-
+| Quantization | Model size (GiB) | Perplexity | Delta to fp16 |
+|--------------|------------------|------------|---------------|
+| Q4_0         | 36.20            | 3.5550     | 3.61%         |
+| Q4_1         | 40.20            | 3.5125     | 2.37%         |
+| Q5_0         | 44.20            | 3.4744     | 1.26%         |
+| Q2_K         | 27.27            | 3.7339     | 8.82%         |
+| Q3_K_S       | 27.86            | 3.7019     | 7.89%         |
+| Q3_K_M       | 30.83            | 3.5932     | 4.72%         |
+| Q3_K_L       | 33.67            | 3.5617     | 3.80%         |
+| Q4_K_S       | 36.39            | 3.4852     | 1.57%         |
+| Q4_K_M       | 38.54            | 3.4725     | 1.20%         |
+| Q5_K_S       | 44.20            | 3.4483     | 0.50%         |
+| Q5_K_M       | 45.41            | 3.4451     | 0.40%         |
+| Q6_K         | 52.70            | 3.4367     | 0.16%         |
+| fp16         | 128.5            | 3.4313     | -             |
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1852,12 +1852,20 @@ int main(int argc, char ** argv) {

    const int32_t n_ctx = params.n_ctx;

+    if (n_ctx <= 0) {
+        fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
+        return 1;
+    }
+
    const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence;
+
    if (ppl) {
-        int n_seq = std::max(1, params.n_batch / n_ctx);
-        int32_t n_kv = n_seq * n_ctx;
+        const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
+        const int32_t n_kv = n_seq * n_ctx;
+
        params.n_parallel = n_seq;
-        params.n_ctx = n_kv;
+        params.n_ctx      = n_kv;
+
        params.n_batch = std::min(params.n_batch, n_kv);
    } else {
        params.n_batch = std::min(params.n_batch, params.n_ctx);
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -4,17 +4,17 @@ TODO

 ## Llama 2 7B

-Quantization | Bits per Weight (BPW)
-- | --
-Q2_K | 3.35
-Q3_K_S | 3.50
-Q3_K_M | 3.91
-Q3_K_L | 4.27
-Q4_K_S | 4.58
-Q4_K_M | 4.84
-Q5_K_S | 5.52
-Q5_K_M | 5.68
-Q6_K | 6.56
+| Quantization | Bits per Weight (BPW) |
+|--------------|-----------------------|
+| Q2_K         | 3.35                  |
+| Q3_K_S       | 3.50                  |
+| Q3_K_M       | 3.91                  |
+| Q3_K_L       | 4.27                  |
+| Q4_K_S       | 4.58                  |
+| Q4_K_M       | 4.84                  |
+| Q5_K_S       | 5.52                  |
+| Q5_K_M       | 5.68                  |
+| Q6_K         | 6.56                  |

 ## Llama 2 13B
 Quantization | Bits per Weight (BPW)
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -8,7 +8,6 @@
 #include <unordered_map>
 #include <fstream>
 #include <cmath>
-#include <algorithm>

 struct quant_option {
    std::string name;
@@ -53,6 +52,10 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "COPY",   LLAMA_FTYPE_ALL_F32,       "only copy tensors, no quantizing", },
 };

+static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE       = "quantize.imatrix.file";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix.dataset";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS   = "quantize.imatrix.chunks_count";

 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
    std::string ftype_str;
@@ -97,6 +100,7 @@ static void usage(const char * executable) {
    printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf("  --keep-split: will generate quatized model in the same shards as input");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -112,7 +116,7 @@ static void usage(const char * executable) {
    exit(1);
 }

-static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
    if (!in) {
        printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
@@ -159,18 +163,33 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
        }
    }
-    printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
+
+    // latest imatrix version contains the dataset filename at the end of the file
+    int m_last_call = 0;
+    if (in.peek() != EOF) {
+        in.read((char *)&m_last_call, sizeof(m_last_call));
+        int dataset_len;
+        in.read((char *)&dataset_len, sizeof(dataset_len));
+        std::vector<char> dataset_as_vec(dataset_len);
+        in.read(dataset_as_vec.data(), dataset_len);
+        imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
+        printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
+    }
+    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
+    return m_last_call;
 }

-static void prepare_imatrix(const std::string & imatrix_file,
+static int prepare_imatrix(const std::string & imatrix_file,
+        std::string & imatrix_dataset,
        const std::vector<std::string> & included_weights,
        const std::vector<std::string> & excluded_weights,
        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+    int m_last_call = -1;
    if (!imatrix_file.empty()) {
-        load_imatrix(imatrix_file, imatrix_data);
+        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
    }
    if (imatrix_data.empty()) {
-        return;
+        return m_last_call;
    }
    if (!excluded_weights.empty()) {
        for (auto& name : excluded_weights) {
@@ -196,6 +215,7 @@ static void prepare_imatrix(const std::string & imatrix_file,
    if (!imatrix_data.empty()) {
        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
    }
+    return m_last_call;
 }

 static ggml_type parse_ggml_type(const char * arg) {
@@ -210,43 +230,6 @@ static ggml_type parse_ggml_type(const char * arg) {
    return result;
 }

-static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
-    const char* sep = strchr(data, '=');
-    if (sep == nullptr || sep - data >= 128) {
-        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
-        return false;
-    }
-    llama_model_kv_override kvo;
-    std::strncpy(kvo.key, data, sep - data);
-    kvo.key[sep - data] = 0;
-    sep++;
-    if (strncmp(sep, "int:", 4) == 0) {
-        sep += 4;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-        kvo.int_value = std::atol(sep);
-    } else if (strncmp(sep, "float:", 6) == 0) {
-        sep += 6;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-        kvo.float_value = std::atof(sep);
-    } else if (strncmp(sep, "bool:", 5) == 0) {
-        sep += 5;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-        if (std::strcmp(sep, "true") == 0) {
-            kvo.bool_value = true;
-        } else if (std::strcmp(sep, "false") == 0) {
-            kvo.bool_value = false;
-        } else {
-            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
-            return false;
-        }
-    } else {
-        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
-        return false;
-    }
-    overrides.emplace_back(std::move(kvo));
-    return true;
-}
-
 int main(int argc, char ** argv) {
    if (argc < 3) {
        usage(argv[0]);
@@ -300,6 +283,8 @@ int main(int argc, char ** argv) {
            } else {
                usage(argv[0]);
            }
+        } else if (strcmp(argv[arg_idx], "--keep-split")) {
+            params.keep_split = true;
        } else {
            usage(argv[0]);
        }
@@ -313,10 +298,43 @@ int main(int argc, char ** argv) {
        usage(argv[0]);
    }

+    std::string imatrix_dataset;
    std::unordered_map<std::string, std::vector<float>> imatrix_data;
-    prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
+    int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
    if (!imatrix_data.empty()) {
        params.imatrix = &imatrix_data;
+        {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+            strncpy(kvo.val_str, imatrix_file.c_str(), 127);
+            kvo.val_str[127] = '\0';
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+        if (!imatrix_dataset.empty()) {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+            strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
+            kvo.val_str[127] = '\0';
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+
+        {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.val_i64 = imatrix_data.size();
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+
+        if (m_last_call > 0) {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.val_i64 = m_last_call;
+            kv_overrides.emplace_back(std::move(kvo));
+        }
    }
    if (!kv_overrides.empty()) {
        kv_overrides.emplace_back();
@@ -332,20 +350,28 @@ int main(int argc, char ** argv) {
    std::string fname_out;

    std::string ftype_str;
+    std::string suffix = ".gguf";
    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
        std::string fpath;
        const size_t pos = fname_inp.find_last_of("/\\");
        if (pos != std::string::npos) {
            fpath = fname_inp.substr(0, pos + 1);
        }
-        // export as [inp path]/ggml-model-[ftype].gguf
-        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
+
+        // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+        fname_out = fpath + "ggml-model-" + ftype_str;
+        if (!params.keep_split) {
+            fname_out += suffix;
+        }
        arg_idx++;
        if (ftype_str == "COPY") {
            params.only_copy = true;
        }
    } else {
        fname_out = argv[arg_idx];
+        if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
+            fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
+        }
        arg_idx++;

        if (argc <= arg_idx) {
--- a/examples/quantize/tests.sh
+++ b/examples/quantize/tests.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+set -eu
+
+if [ $# -lt 1 ]
+then
+    echo "usage:   $0 path_to_build_binary [path_to_temp_folder]"
+    echo "example: $0 ../../build/bin ../../tmp"
+    exit 1
+fi
+
+if [ $# -gt 1 ]
+then
+    TMP_DIR=$2
+else
+    TMP_DIR=/tmp
+fi
+
+set -x
+
+SPLIT=$1/gguf-split
+QUANTIZE=$1/quantize
+MAIN=$1/main
+WORK_PATH=$TMP_DIR/quantize
+ROOT_DIR=$(realpath $(dirname $0)/../../)
+
+mkdir -p "$WORK_PATH"
+
+# Clean up in case of previously failed test
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
+
+# 1. Get a model
+(
+cd $WORK_PATH
+"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+)
+echo PASS
+
+# 2. Split model
+$SPLIT --split-max-tensors 28  $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
+echo PASS
+echo
+
+# 3. Requant model with '--keep_split'
+$QUANTIZE --allow-requantize --keep_split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
+echo PASS
+echo
+
+# 3a. Test the requanted model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 4. Requant mode without '--keep_split'
+$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
+echo PASS
+echo
+
+# 4b. Test the requanted model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# Clean up
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
--- a/examples/regex-to-grammar.py
+++ b/examples/regex-to-grammar.py
@@ -8,7 +8,7 @@ print(subprocess.check_output(
        "python",
        os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
-        "json-schema-to-grammar.py"),
+        "json_schema_to_grammar.py"),
        *rest,
        "-",
        "--raw-pattern",
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -1,17 +1,34 @@
 set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET}
+include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+set(TARGET_SRCS
    server.cpp
    utils.hpp
    httplib.h
 )
+set(PUBLIC_ASSETS
+    index.html
+    index.js
+    completion.js
+    json-schema-to-grammar.mjs
+)
+foreach(asset ${PUBLIC_ASSETS})
+    set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
+    set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
+    list(APPEND TARGET_SRCS ${output})
+    add_custom_command(
+        DEPENDS "${input}"
+        OUTPUT "${output}"
+        COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
+    )
+endforeach()
+add_executable(${TARGET} ${TARGET_SRCS})
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common json-schema-to-grammar ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
 if (LLAMA_SERVER_SSL)
    find_package(OpenSSL REQUIRED)
    target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -11,6 +11,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 * Continuous batching
 * Multimodal (wip)
 * Monitoring endpoints
+ * Schema-constrained JSON response format

 The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).

@@ -250,6 +251,8 @@ node index.js

    `grammar`: Set grammar for grammar-based sampling.  Default: no grammar

+    `json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` of a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features.  Default: no JSON schema.
+
    `seed`: Set the random number generator (RNG) seed.  Default: `-1`, which is a random seed.

    `ignore_eos`: Ignore end of stream token and continue generating.  Default: `false`
@@ -365,6 +368,8 @@ Notice that each `probs` is an array of length `n_probs`.

    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.

+    The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}`), similar to other OpenAI-inspired API providers.
+
    *Examples:*

    You can use either Python `openai` library with appropriate checkpoints:
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -90,7 +90,8 @@ export default function () {
        "model": model,
        "stream": true,
        "seed": 42,
-        "max_tokens": max_tokens
+        "max_tokens": max_tokens,
+        "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
    }

    const params = {method: 'POST', body: JSON.stringify(payload)};
--- a/examples/server/completion.js.hpp
+++ b/examples/server/completion.js.hpp
@@ -1,496 +0,0 @@
-unsigned char completion_js[] = {
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x44,
-  0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x3a, 0x20, 0x74, 0x72,
-  0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64,
-  0x69, 0x63, 0x74, 0x3a, 0x20, 0x35, 0x30, 0x30, 0x2c, 0x0a, 0x20, 0x20,
-  0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x3a,
-  0x20, 0x30, 0x2e, 0x32, 0x2c, 0x0a, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70,
-  0x3a, 0x20, 0x5b, 0x22, 0x3c, 0x2f, 0x73, 0x3e, 0x22, 0x5d, 0x0a, 0x7d,
-  0x3b, 0x0a, 0x0a, 0x6c, 0x65, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
-  0x67, 0x73, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x0a,
-  0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65,
-  0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
-  0x20, 0x61, 0x73, 0x20, 0x61, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
-  0x74, 0x6f, 0x72, 0x2e, 0x20, 0x52, 0x65, 0x63, 0x6f, 0x6d, 0x6d, 0x65,
-  0x6e, 0x64, 0x65, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x6d, 0x6f, 0x73,
-  0x74, 0x20, 0x75, 0x73, 0x65, 0x20, 0x63, 0x61, 0x73, 0x65, 0x73, 0x2e,
-  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70,
-  0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
-  0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27,
-  0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e,
-  0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65,
-  0x73, 0x74, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x22,
-  0x54, 0x65, 0x6c, 0x6c, 0x20, 0x6d, 0x65, 0x20, 0x61, 0x20, 0x6a, 0x6f,
-  0x6b, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64,
-  0x69, 0x63, 0x74, 0x3a, 0x20, 0x38, 0x30, 0x30, 0x7d, 0x29, 0x0a, 0x2f,
-  0x2f, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61,
-  0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68,
-  0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65,
-  0x73, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77,
-  0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64,
-  0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
-  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x2f, 0x2f, 0x0a,
-  0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63,
-  0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x2a, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c,
-  0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d,
-  0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b,
-  0x7d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63,
-  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20,
-  0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x72,
-  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x20, 0x3d,
-  0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x61, 0x70, 0x69, 0x5f,
-  0x75, 0x72, 0x6c, 0x20, 0x7c, 0x7c, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x0a,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x63, 0x6f, 0x6e, 0x74, 0x72,
-  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20,
-  0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43,
-  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x28, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
-  0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e,
-  0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x44, 0x65, 0x66, 0x61, 0x75,
-  0x6c, 0x74, 0x73, 0x2c, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x2c, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x7d,
-  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72,
-  0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x77,
-  0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x60, 0x24,
-  0x7b, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x7d, 0x2f, 0x63, 0x6f,
-  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x60, 0x2c, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x74, 0x68, 0x6f, 0x64, 0x3a,
-  0x20, 0x27, 0x50, 0x4f, 0x53, 0x54, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x62, 0x6f, 0x64, 0x79, 0x3a, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e,
-  0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x63, 0x6f,
-  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x61,
-  0x64, 0x65, 0x72, 0x73, 0x3a, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x27, 0x43, 0x6f, 0x6e, 0x6e, 0x65, 0x63, 0x74, 0x69, 0x6f,
-  0x6e, 0x27, 0x3a, 0x20, 0x27, 0x6b, 0x65, 0x65, 0x70, 0x2d, 0x61, 0x6c,
-  0x69, 0x76, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x27, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54, 0x79, 0x70,
-  0x65, 0x27, 0x3a, 0x20, 0x27, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61,
-  0x74, 0x69, 0x6f, 0x6e, 0x2f, 0x6a, 0x73, 0x6f, 0x6e, 0x27, 0x2c, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, 0x41, 0x63, 0x63, 0x65, 0x70,
-  0x74, 0x27, 0x3a, 0x20, 0x27, 0x74, 0x65, 0x78, 0x74, 0x2f, 0x65, 0x76,
-  0x65, 0x6e, 0x74, 0x2d, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x27, 0x2c,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x28, 0x70,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65,
-  0x79, 0x20, 0x3f, 0x20, 0x7b, 0x27, 0x41, 0x75, 0x74, 0x68, 0x6f, 0x72,
-  0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x27, 0x3a, 0x20, 0x60, 0x42,
-  0x65, 0x61, 0x72, 0x65, 0x72, 0x20, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x7d, 0x60,
-  0x7d, 0x20, 0x3a, 0x20, 0x7b, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61,
-  0x6c, 0x3a, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65,
-  0x72, 0x2e, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x0a, 0x20, 0x20,
-  0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
-  0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, 0x65,
-  0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x2e, 0x62, 0x6f, 0x64, 0x79, 0x2e,
-  0x67, 0x65, 0x74, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63,
-  0x6f, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x54,
-  0x65, 0x78, 0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x28, 0x29,
-  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e,
-  0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20,
-  0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65,
-  0x72, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x42,
-  0x75, 0x66, 0x66, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x61,
-  0x72, 0x74, 0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x72, 0x65, 0x61, 0x64,
-  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72,
-  0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
-  0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20,
-  0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73,
-  0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
-  0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61, 0x64, 0x28,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
-  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f, 0x6e, 0x65,
-  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
-  0x20, 0x41, 0x64, 0x64, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x6c, 0x65, 0x66,
-  0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x74,
-  0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e,
-  0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64,
-  0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x6c,
-  0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x64, 0x65,
-  0x63, 0x6f, 0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65,
-  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75,
-  0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
-  0x2f, 0x20, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x20, 0x69, 0x66, 0x20, 0x74,
-  0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x72,
-  0x61, 0x63, 0x74, 0x65, 0x72, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x6c,
-  0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x6e,
-  0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65, 0x42, 0x72,
-  0x65, 0x61, 0x6b, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, 0x65,
-  0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x5c, 0x6e, 0x27,
-  0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
-  0x20, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74,
-  0x65, 0x78, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x6c, 0x69, 0x6e,
-  0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74,
-  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78,
-  0x74, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x5c, 0x6e, 0x27,
-  0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
-  0x20, 0x49, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, 0x78, 0x74,
-  0x20, 0x64, 0x6f, 0x65, 0x73, 0x6e, 0x27, 0x74, 0x20, 0x65, 0x6e, 0x64,
-  0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65,
-  0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2c, 0x20, 0x74, 0x68, 0x65, 0x6e,
-  0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x6c, 0x69,
-  0x6e, 0x65, 0x20, 0x69, 0x73, 0x20, 0x69, 0x6e, 0x63, 0x6f, 0x6d, 0x70,
-  0x6c, 0x65, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
-  0x2f, 0x20, 0x53, 0x74, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x74, 0x20, 0x69,
-  0x6e, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x74,
-  0x6f, 0x20, 0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64, 0x20, 0x74,
-  0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6e, 0x65, 0x78, 0x74, 0x20, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21,
-  0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65,
-  0x42, 0x72, 0x65, 0x61, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65,
-  0x72, 0x20, 0x3d, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x2e, 0x70, 0x6f,
-  0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72,
-  0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x52, 0x65,
-  0x73, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72,
-  0x20, 0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, 0x76, 0x65, 0x20,
-  0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b,
-  0x20, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e, 0x64, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x50, 0x61, 0x72, 0x73, 0x65, 0x20,
-  0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76, 0x65, 0x6e,
-  0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64, 0x20, 0x74,
-  0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20, 0x2f, 0x5e,
-  0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e, 0x2a, 0x29,
-  0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c,
-  0x69, 0x6e, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73,
-  0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20,
-  0x3d, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x2e, 0x65, 0x78, 0x65, 0x63,
-  0x28, 0x6c, 0x69, 0x6e, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x61, 0x74, 0x63,
-  0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b, 0x6d, 0x61,
-  0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20, 0x6d, 0x61,
-  0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6e, 0x63,
-  0x65, 0x20, 0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, 0x74, 0x68,
-  0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e,
-  0x63, 0x70, 0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x6a,
-  0x75, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x74,
-  0x68, 0x65, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x64,
-  0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74,
-  0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73,
-  0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a,
-  0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65,
-  0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63,
-  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
-  0x79, 0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c, 0x64, 0x20,
-  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
-  0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20,
-  0x73, 0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66,
-  0x72, 0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20,
-  0x77, 0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61,
-  0x6b, 0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73,
-  0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
-  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61,
-  0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
-  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69,
-  0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20,
-  0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74,
-  0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e,
-  0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20,
-  0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65,
-  0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75,
-  0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75,
-  0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x4a,
-  0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65,
-  0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x6d, 0x65, 0x73, 0x73,
-  0x61, 0x67, 0x65, 0x2e, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x73,
-  0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61,
-  0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x68, 0x72, 0x6f, 0x77,
-  0x20, 0x61, 0x6e, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x74, 0x6f,
-  0x20, 0x62, 0x65, 0x20, 0x63, 0x61, 0x75, 0x67, 0x68, 0x74, 0x20, 0x62,
-  0x79, 0x20, 0x75, 0x70, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x20, 0x63,
-  0x61, 0x6c, 0x6c, 0x65, 0x72, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74,
-  0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72,
-  0x6f, 0x72, 0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61,
-  0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65,
-  0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e,
-  0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x5b, 0x24,
-  0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f,
-  0x72, 0x2e, 0x63, 0x6f, 0x64, 0x65, 0x7d, 0x20, 0x2d, 0x20, 0x24, 0x7b,
-  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72,
-  0x2e, 0x74, 0x79, 0x70, 0x65, 0x7d, 0x5d, 0x3a, 0x20, 0x24, 0x7b, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e,
-  0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28,
-  0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
-  0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f,
-  0x72, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65,
-  0x72, 0x72, 0x6f, 0x72, 0x7d, 0x60, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e,
-  0x6e, 0x61, 0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62,
-  0x6f, 0x72, 0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f,
-  0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22,
-  0x2c, 0x20, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b,
-  0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c,
-  0x6c, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72,
-  0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61,
-  0x6c, 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65,
-  0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e,
-  0x74, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61,
-  0x74, 0x20, 0x79, 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75,
-  0x62, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f,
-  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65,
-  0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69,
-  0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
-  0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
-  0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f,
-  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27,
-  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
-  0x67, 0x65, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a,
-  0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61,
-  0x64, 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65,
-  0x6e, 0x65, 0x72, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65,
-  0x22, 0x2c, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d,
-  0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
-  0x74, 0x65, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74,
-  0x61, 0x69, 0x6c, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
-  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f,
-  0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74,
-  0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72,
-  0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
-  0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69,
-  0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65,
-  0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e,
-  0x65, 0x77, 0x20, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
-  0x65, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79,
-  0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20,
-  0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f,
-  0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c,
-  0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68,
-  0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b,
-  0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
-  0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65,
-  0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64,
-  0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74,
-  0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45,
-  0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67,
-  0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
-  0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61,
-  0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
-  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67,
-  0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
-  0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
-  0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74,
-  0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20,
-  0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28,
-  0x22, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
-  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b,
-  0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75,
-  0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65,
-  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
-  0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
-  0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74,
-  0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65,
-  0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73,
-  0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e,
-  0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65,
-  0x6e, 0x74, 0x28, 0x22, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22,
-  0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20,
-  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74,
-  0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
-  0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74,
-  0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20,
-  0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28,
-  0x22, 0x64, 0x6f, 0x6e, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65,
-  0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x6e, 0x74, 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x7d, 0x29, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
-  0x67, 0x65, 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43,
-  0x61, 0x6c, 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d,
-  0x69, 0x73, 0x65, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73,
-  0x6f, 0x6c, 0x76, 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74,
-  0x65, 0x78, 0x74, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f,
-  0x65, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f,
-  0x72, 0x74, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67,
-  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70,
-  0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69,
-  0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74,
-  0x68, 0x65, 0x6e, 0x28, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
-  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74,
-  0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x6f, 0x72, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28,
-  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e,
-  0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
-  0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70,
-  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d,
-  0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66,
-  0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e,
-  0x65, 0x77, 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61,
-  0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
-  0x65, 0x2c, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d,
-  0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
-  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77,
-  0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
-  0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61,
-  0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67,
-  0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20,
-  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63,
-  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
-  0x73, 0x6f, 0x6c, 0x76, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e,
-  0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61,
-  0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65,
-  0x63, 0x74, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d,
-  0x3b, 0x0a, 0x0a, 0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64,
-  0x65, 0x70, 0x72, 0x65, 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20,
-  0x2a, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f,
-  0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d,
-  0x70, 0x6c, 0x65, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e,
-  0x63, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63,
-  0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63,
-  0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69,
-  0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75,
-  0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28,
-  0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70,
-  0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b,
-  0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20,
-  0x7d, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61,
-  0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f,
-  0x20, 0x47, 0x65, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64,
-  0x65, 0x6c, 0x20, 0x69, 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d,
-  0x20, 0x74, 0x68, 0x65, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e,
-  0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65,
-  0x66, 0x75, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74,
-  0x69, 0x6e, 0x67, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x78, 0x74, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61,
-  0x6e, 0x64, 0x20, 0x73, 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78,
-  0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66,
-  0x6f, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x63,
-  0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20,
-  0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21,
-  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
-  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x61, 0x70, 0x69,
-  0x5f, 0x75, 0x72, 0x6c, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69,
-  0x67, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x20, 0x7c, 0x7c,
-  0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x20, 0x3d, 0x20, 0x61,
-  0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x60,
-  0x24, 0x7b, 0x61, 0x70, 0x69, 0x5f, 0x75, 0x72, 0x6c, 0x7d, 0x2f, 0x70,
-  0x72, 0x6f, 0x70, 0x73, 0x60, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28,
-  0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28,
-  0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65,
-  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
-  0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e,
-  0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x67, 0x65, 0x6e, 0x65,
-  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
-  0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x72,
-  0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
-  0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
-  0x73, 0x3b, 0x0a, 0x7d, 0x0a
-};
-unsigned int completion_js_len = 5909;
--- a/examples/server/deps.sh
+++ b/examples/server/deps.sh
@@ -8,13 +8,3 @@ PUBLIC=$DIR/public
 echo "download js bundle files"
 curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
 echo >> $PUBLIC/index.js # add newline
-
-FILES=$(ls $PUBLIC)
-
-cd $PUBLIC
-for FILE in $FILES; do
-  echo "generate $FILE.hpp"
-
-  # use simple flag for old version of xxd
-  xxd -i $FILE > $DIR/$FILE.hpp
-done
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/index.js.hpp
+++ b/examples/server/index.js.hpp
--- a/examples/server/json-schema-to-grammar.mjs.hpp
+++ b/examples/server/json-schema-to-grammar.mjs.hpp
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -51,26 +51,6 @@
      margin-bottom: 0.5em;
    }

-  button, input, textarea, .button, a.button, select {
-      color: #666;
-      border: 1px solid #ddd;
-      border-radius: 4px;
-      line-height: 1.5em;
-      padding: 0.25em 0.25em;
-      text-decoration: none;
-      font-size: 1.1rem;
-    }
-
-    button {
-      border: 1px solid #2a8aad;
-      background: #3584e4;
-      font-weight: normal;
-      color: #fff;
-    }
-    button:disabled {
-      background: #9cbce5;
-    }
-
    #write form {
      margin: 1em 0 0 0;
      display: flex;
@@ -587,7 +567,7 @@
        runCompletion();
      }
      return html`
-        <div class="right">
+        <div>
          <button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
          <button onclick=${stop} disabled=${!generating.value}>Stop</button>
          <button onclick=${reset}>Reset</button>
@@ -901,11 +881,11 @@
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
-        .replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
-        .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
-        .replace(/__(.*?)__/g, '<strong>$1</strong>')
-        .replace(/\*(.*?)\*/g, '<em>$1</em>')
-        .replace(/_(.*?)_/g, '<em>$1</em>')
+        .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
+        .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
+        .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
+        .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
+        .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
        .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
        .replace(/`(.*?)`/g, '<code>$1</code>')
        .replace(/\n/gim, '<br />');
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -1,33 +1,95 @@
-// WARNING: This file was ported from json-schema-to-grammar.py, please fix bugs / add features there first.
+// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
 const SPACE_RULE = '" "?';

+function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
+  const separatorRule = opts.separatorRule ?? '';
+  const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false
+
+  if (separatorRule === '') {
+    if (minItems === 0 && maxItems === 1) {
+      return `${itemRule}?`;
+    } else if (minItems === 1 && maxItems === undefined) {
+      return `${itemRule}+`;
+    }
+  }
+
+  let result = '';
+  if (minItems > 0) {
+    if (itemRuleIsLiteral && separatorRule === '') {
+      result = `"${itemRule.slice(1, -1).repeat(minItems)}"`;
+    } else {
+      result = Array.from({ length: minItems }, () => itemRule)
+        .join(separatorRule !== '' ? ` ${separatorRule} ` : ' ');
+    }
+  }
+
+  const optRepetitions = (upToN, prefixWithSep=false) => {
+    const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule;
+    if (upToN === 0) {
+      return '';
+    } else if (upToN === 1) {
+      return `(${content})?`;
+    } else if (separatorRule !== '' && !prefixWithSep) {
+      return `(${content} ${optRepetitions(upToN - 1, true)})?`;
+    } else {
+      return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join('');
+    }
+  };
+
+  if (minItems > 0 && maxItems !== minItems) {
+    result += ' ';
+  }
+
+  if (maxItems !== undefined) {
+    result += optRepetitions(maxItems - minItems, minItems > 0);
+  } else {
+    const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`;
+
+    if (minItems === 0 && separatorRule !== '') {
+      result = `(${itemRule} ${itemOperator}*)?`;
+    } else {
+      result += `${itemOperator}*`;
+    }
+  }
+
+  return result;
+}
+
+class BuiltinRule {
+  constructor(content, deps) {
+    this.content = content;
+    this.deps = deps || [];
+  }
+}
+
+const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15);
+
 const PRIMITIVE_RULES = {
-  boolean: '("true" | "false") space',
-  number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
-  integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
-  value: 'object | array | string | number | boolean',
-  object: '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
-  array: '"[" space ( value ("," space value)* )? "]" space',
-  uuid: '"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space',
-  string: ` "\\"" (
-        [^"\\\\] |
-        "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
-      )* "\\"" space`,
-  null: '"null" space',
+  boolean        : new BuiltinRule('("true" | "false") space', []),
+  'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []),
+  'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []),
+  number         : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
+  integer        : new BuiltinRule('("-"? integral-part) space', ['integral-part']),
+  value          : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
+  object         : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
+  array          : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
+  uuid           : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []),
+  char           : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []),
+  string         : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
+  null           : new BuiltinRule('"null" space', []),
 };
-const OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value'];

 // TODO: support "uri", "email" string formats
-const DATE_RULES = {
-    'date'   : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
-    'time'   : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
-    'date-time': 'date "T" time',
-    'date-string': '"\\"" date "\\"" space',
-    'time-string': '"\\"" time "\\"" space',
-    'date-time-string': '"\\"" date-time "\\"" space',
-};
+const STRING_FORMAT_RULES = {
+  'date'            : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
+  'time'            : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
+  'date-time'       : new BuiltinRule('date "T" time', ['date', 'time']),
+  'date-string'     : new BuiltinRule('"\\"" date "\\"" space', ['date']),
+  'time-string'     : new BuiltinRule('"\\"" time "\\"" space', ['time']),
+  'date-time-string': new BuiltinRule('"\\"" date-time "\\"" space', ['date-time']),
+}

-const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...DATE_RULES};
+const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...STRING_FORMAT_RULES};

 const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
 const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
@@ -158,7 +220,7 @@ export class SchemaConverter {
        rule = '[\\U00000000-\\U0010FFFF]';
      } else {
        // Accept any character... except \n and \r line break chars (\x0A and \xOD)
-        rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]';
+        rule = '[^\\x0A\\x0D]';
      }
      return this._addRule('dot', rule);
    };
@@ -259,26 +321,19 @@ export class SchemaConverter {

          let [sub, subIsLiteral] = seq[seq.length - 1];

-          if (minTimes === 0 && maxTimes === Infinity) {
-            seq[seq.length - 1] = [`${sub}*`, false];
-          } else if (minTimes === 0 && maxTimes === 1) {
-            seq[seq.length - 1] = [`${sub}?`, false];
-          } else if (minTimes === 1 && maxTimes === Infinity) {
-            seq[seq.length - 1] = [`${sub}+`, false];
-          } else {
-            if (!subIsLiteral) {
-              let id = subRuleIds[sub];
-              if (id === undefined) {
-                id = this._addRule(`${name}-${Object.keys(subRuleIds).length + 1}`, sub);
-                subRuleIds[sub] = id;
-              }
-              sub = id;
+          if (!subIsLiteral) {
+            let id = subRuleIds[sub];
+            if (id === undefined) {
+              id = this._addRule(`${name}-${Object.keys(subRuleIds).length + 1}`, sub);
+              subRuleIds[sub] = id;
            }
-
-            const repeatedSub = Array.from({ length: minTimes }, () => subIsLiteral ? `"${sub.slice(1, -1).repeat(minTimes)}"` : sub);
-            const optionalSub = maxTimes !== undefined ? Array.from({ length: maxTimes - minTimes }, () => `${sub}?`) : [`${sub}*`];
-            seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false];
+            sub = id;
          }
+
+          seq[seq.length - 1] = [
+            _buildRepetition(subIsLiteral ? `"${sub}"` : sub, minTimes, maxTimes, {itemRuleIsLiteral: subIsLiteral}),
+            false
+          ];
        } else {
          let literal = '';
          while (i < length) {
@@ -394,49 +449,50 @@ export class SchemaConverter {
        );
      } else {
        const itemRuleName = this.visit(items, `${name ?? ''}${name ? '-' : ''}item`);
-        const listItemOperator = `( "," space ${itemRuleName} )`;
-        let successiveItems = '';
-        let minItems = schema.minItems || 0;
+        const minItems = schema.minItems || 0;
        const maxItems = schema.maxItems;
-        if (minItems > 0) {
-          successiveItems = listItemOperator.repeat(minItems - 1);
-          minItems--;
-        }
-        if (maxItems !== undefined && maxItems > minItems) {
-          successiveItems += `${listItemOperator}?`.repeat(maxItems - minItems - 1);
-        } else {
-          successiveItems += `${listItemOperator}*`;
-        }
-        const rule = minItems === 0
-          ? `"[" space ( ${itemRuleName} ${successiveItems} )? "]" space`
-          : `"[" space ${itemRuleName} ${successiveItems} "]" space`;
-        return this._addRule(ruleName, rule);
+        return this._addRule(ruleName, '"[" space ' + _buildRepetition(itemRuleName, minItems, maxItems, {separatorRule: '"," space'}) + ' "]" space');
      }
    } else if ((schemaType === undefined || schemaType === 'string') && 'pattern' in schema) {
      return this._visitPattern(schema.pattern, ruleName);
    } else if ((schemaType === undefined || schemaType === 'string') && /^uuid[1-5]?$/.test(schema.format || '')) {
-      return this._addRule(
-          ruleName === 'root' ? 'root' : schemaFormat,
-          PRIMITIVE_RULES['uuid'])
-    } else if ((schemaType === undefined || schemaType === 'string') && schema.format in DATE_RULES) {
-      for (const [t, r] of Object.entries(DATE_RULES)) {
-        this._addRule(t, r);
-      }
-      return schemaFormat + '-string';
+      return this._addPrimitive(
+        ruleName === 'root' ? 'root' : schemaFormat,
+        PRIMITIVE_RULES['uuid']
+      );
+    } else if ((schemaType === undefined || schemaType === 'string') && `${schema.format}-string` in STRING_FORMAT_RULES) {
+      const primName = `${schema.format}-string`
+      return this._addRule(ruleName, this._addPrimitive(primName, STRING_FORMAT_RULES[primName]));
+    } else if (schemaType === 'string' && ('minLength' in schema || 'maxLength' in schema)) {
+      const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
+      const minLen = schema.minLength || 0;
+      const maxLen = schema.maxLength;
+      return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
    } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
-      for (const n of OBJECT_RULE_NAMES) {
-        this._addRule(n, PRIMITIVE_RULES[n]);
-      }
-      return this._addRule(ruleName, 'object');
+      return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
    } else {
      if (!(schemaType in PRIMITIVE_RULES)) {
        throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
      }
      // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
-      return this._addRule(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
+      return this._addPrimitive(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
    }
  }

+  _addPrimitive(name, rule) {
+    let n = this._addRule(name, rule.content);
+    for (const dep of rule.deps) {
+      const depRule = PRIMITIVE_RULES[dep] || STRING_FORMAT_RULES[dep];
+      if (!depRule) {
+        throw new Error(`Rule ${dep} not known`);
+      }
+      if (!(dep in this._rules)) {
+        this._addPrimitive(dep, depRule);
+      }
+    }
+    return n;
+  }
+
  _buildObjectRule(properties, required, name, additionalProperties) {
    const propOrder = this._propOrder;
    // sort by position in prop_order (if specified) then by original order
@@ -462,7 +518,7 @@ export class SchemaConverter {
      const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
      propKvRuleNames['*'] = this._addRule(
        `${subName}-kv`,
-        `${this._addRule('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
+        `${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
      optionalProps.push('*');
    }

--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -854,12 +854,12 @@ struct server_context {
        slot.sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
        slot.params.n_keep             = json_value(data, "n_keep",            slot.params.n_keep);
        slot.params.n_discard          = json_value(data, "n_discard",         default_params.n_discard);
-        slot.params.seed               = json_value(data, "seed",              default_params.seed);
+        slot.sparams.seed              = json_value(data, "seed",              default_sparams.seed);
        slot.sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
        slot.sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);

        // process "json_schema" and "grammar"
-        if (data.contains("json_schema") && data.contains("grammar")) {
+        if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
            send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
            return false;
        } else if (data.contains("json_schema") && !data.contains("grammar")) {
@@ -1028,7 +1028,6 @@ struct server_context {
                send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
                return false;
            }
-            llama_set_rng_seed(ctx, slot.params.seed);
        }

        slot.command = SLOT_COMMAND_LOAD_PROMPT;
@@ -1083,7 +1082,7 @@ struct server_context {
                };

                if (llama_decode(ctx, batch_view) != 0) {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    LOG_ERROR("llama_decode() failed", {});
                    return;
                }
            }
@@ -1118,7 +1117,7 @@ struct server_context {

    bool process_token(completion_token_output & result, server_slot & slot) {
        // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
        slot.sampled = result.tok;

        // search stop word and delete it
@@ -1201,13 +1200,34 @@ struct server_context {
            });
        }

-        if (result.tok == llama_token_eos(model)) {
+        if (llama_token_is_eog(model, result.tok)) {
            slot.stopped_eos    = true;
            slot.has_next_token = false;

            LOG_VERBOSE("eos token found", {});
        }

+        auto n_ctx_train = llama_n_ctx_train(model);
+        if (slot.params.n_predict < 1 && slot.ga_n == 1
+                    && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
+            LOG_WARNING("n_predict is not set and self-context extend is disabled."
+                        " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
+                    { "id_slot",              slot.id },
+                    { "params.n_predict",     slot.params.n_predict },
+                    { "slot.n_prompt_tokens", slot.n_prompt_tokens },
+                    { "slot.n_decoded",       slot.n_decoded },
+                    { "slot.n_predict",       slot.n_predict },
+                    { "n_slots",              params.n_parallel },
+                    { "slot.n_ctx",           slot.n_ctx },
+                    { "n_ctx",                n_ctx },
+                    { "n_ctx_train",          n_ctx_train },
+                    { "ga_n",                 slot.ga_n },
+                });
+            slot.truncated      = true;
+            slot.stopped_limit  = true;
+            slot.has_next_token = false; // stop prediction
+        }
+
        LOG_VERBOSE("next token", {
            {"id_slot",        slot.id},
            {"id_task",        slot.id_task},
@@ -1281,7 +1301,11 @@ struct server_context {
    }

    void send_error(const int id_task, const int id_multi, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
-        LOG_TEE("task %i - error: %s\n", id_task, error.c_str());
+        LOG_ERROR("task error", {
+            {"id_multi", id_multi},
+            {"id_task", id_task},
+            {"error", error},
+        });

        server_task_result res;
        res.id       = id_task;
@@ -2138,7 +2162,7 @@ struct server_context {
        });

        // process the created batch of tokens
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+        for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);

            for (auto & slot : slots) {
@@ -2186,7 +2210,11 @@ struct server_context {
            if (ret != 0) {
                if (n_batch == 1 || ret < 0) {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", {
+                        {"i",   i},
+                        {"n_batch",  ret},
+                        {"ret",   ret},
+                    });
                    for (auto & slot : slots) {
                        slot.state = SLOT_STATE_PROCESSING;
                        slot.command = SLOT_COMMAND_NONE;
@@ -2196,12 +2224,16 @@ struct server_context {
                    break; // break loop of n_batch
                }

-                LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
-
                // retry with half the batch size to try to find a free slot in the KV cache
                n_batch /= 2;
                i -= n_batch;

+                LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", {
+                    {"i",   i},
+                    {"n_batch",  n_batch},
+                    {"ret",   ret},
+                });
+
                continue; // continue loop of n_batch
            }

@@ -2360,7 +2392,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
    printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+    printf("                            types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
    printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
    printf("  --chat-template JINJA_TEMPLATE\n");
@@ -2791,43 +2823,11 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                invalid_param = true;
                break;
            }
-            char * sep = strchr(argv[i], '=');
-            if (sep == nullptr || sep - argv[i] >= 128) {
-                fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-
-            struct llama_model_kv_override kvo;
-            std::strncpy(kvo.key, argv[i], sep - argv[i]);
-            kvo.key[sep - argv[i]] = 0;
-            sep++;
-            if (strncmp(sep, "int:", 4) == 0) {
-                sep += 4;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-                kvo.int_value = std::atol(sep);
-            } else if (strncmp(sep, "float:", 6) == 0) {
-                sep += 6;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-                kvo.float_value = std::atof(sep);
-            } else if (strncmp(sep, "bool:", 5) == 0) {
-                sep += 5;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-                if (std::strcmp(sep, "true") == 0) {
-                    kvo.bool_value = true;
-                } else if (std::strcmp(sep, "false") == 0) {
-                    kvo.bool_value = false;
-                } else {
-                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                    invalid_param = true;
-                    break;
-                }
-            } else {
+            if (!parse_kv_override(argv[i], params.kv_overrides)) {
                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
                invalid_param = true;
                break;
            }
-            params.kv_overrides.push_back(kvo);
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            server_print_usage(argv[0], default_params, default_sparams);
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -29,7 +29,7 @@ To mitigate it, you can increase values in `n_predict`, `kv_size`.
 cd ../../..
 mkdir build
 cd build
-cmake ../
+cmake -DLLAMA_CURL=ON ../
 cmake --build . --target server
 ```

--- a/examples/server/tests/features/results.feature
+++ b/examples/server/tests/features/results.feature
@@ -0,0 +1,57 @@
+@llama.cpp
+@results
+Feature: Results
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And   a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
+    And   a model file test-model-00001-of-00003.gguf
+    And   128 as batch size
+    And   256 KV cache size
+    And   128 max tokens to predict
+
+  Scenario Outline: Multi users completion
+    Given <n_slots> slots
+    And   continuous batching
+    Then  the server is starting
+    Then  the server is healthy
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given 42 as seed
+    And a prompt:
+      """
+      Write a very long story about AI.
+      """
+
+    Given concurrent completion requests
+    Then the server is busy
+    Then the server is idle
+    And  all slots are idle
+    Then all predictions are equal
+    Examples:
+      | n_slots |
+      | 1       |
+      | 2       |
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -61,6 +61,7 @@ def step_server_config(context, server_fqdn, server_port):
    context.server_metrics = False
    context.server_process = None
    context.seed = None
+    context.draft = None
    context.server_seed = None
    context.user_api_key = None
    context.response_format = None
@@ -107,6 +108,11 @@ def step_n_gpu_layer(context, ngl):
    context.n_gpu_layer = ngl


+@step('{draft:d} as draft')
+def step_draft(context, draft):
+    context.draft = draft
+
+
@step('{n_ctx:d} KV cache size')
 def step_n_ctx(context, n_ctx):
    context.n_ctx = n_ctx
@@ -254,6 +260,15 @@ def step_n_tokens_predicted(context, predicted_n):
    assert_n_tokens_predicted(context.completion, predicted_n)


+@step('all predictions are equal')
+@async_run_until_complete
+async def step_predictions_equal(context):
+    n_completions = await gather_tasks_results(context)
+    assert n_completions >= 2, "need at least 2 completions"
+    assert_all_predictions_equal(context.tasks_result)
+    context.tasks_result = []
+
+
@step('the completion is  truncated')
 def step_assert_completion_truncated(context):
    step_assert_completion_truncated(context, '')
@@ -1020,6 +1035,23 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
        assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
                                                     f' {n_predicted} <> {expected_predicted_n}')

+def assert_all_predictions_equal(completion_responses):
+    content_0 = completion_responses[0]['content']
+
+    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+        print(f"content 0: {content_0}")
+
+    i = 1
+    for response in completion_responses[1:]:
+        content = response['content']
+
+        if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+            print(f"content {i}: {content}")
+
+        assert content == content_0, "contents not equal"
+
+        i += 1
+

 async def gather_tasks_results(context):
    n_tasks = len(context.concurrent_tasks)
@@ -1148,6 +1180,8 @@ def start_server_background(context):
        server_args.extend(['--ubatch-size', context.n_ubatch])
    if context.n_gpu_layer:
        server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
+    if context.draft is not None:
+        server_args.extend(['--draft', context.draft])
    if context.server_continuous_batching:
        server_args.append('--cont-batching')
    if context.server_embeddings:
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -4,9 +4,8 @@ set -eu

 if [ $# -lt 1 ]
 then
-  # Start @llama.cpp scenario
-  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+    # Start @llama.cpp scenario
+    behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 else
-  behave "$@"
+    behave "$@"
 fi
-
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -381,10 +381,6 @@ static json oaicompat_completion_params_parse(
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }
-    // Some chat templates don't use EOS token to stop generation
-    // We must add their end sequences to list of stop words
-    llama_params["stop"].push_back("<|im_end|>"); // chatml
-    llama_params["stop"].push_back("<end_of_turn>"); // gemma

    // Handle "response_format" field
    if (body.contains("response_format")) {
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -133,8 +133,8 @@ int main(int argc, char ** argv) {
            // sample the most likely token
            const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

-            // is it an end of stream?
-            if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
                LOG_TEE("\n");

                break;
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -360,7 +360,7 @@ int main(int argc, char ** argv) {
                    }
                }

-                if (token_id == llama_token_eos(model_tgt)) {
+                if (llama_token_is_eog(model_tgt, token_id)) {
                    has_eos = true;
                }
                ++n_predict;
--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
@@ -20,4 +20,4 @@ cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 #cmake --build . --config Release --target llama-bench

 #build all binary
-cmake --build . --config Release -v
+cmake --build . --config Release -j -v
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -12,6 +12,7 @@ if [ $# -gt 0 ]; then
    GGML_SYCL_SINGLE_GPU=1
 else
    GGML_SYCL_DEVICE=0
+    GGML_SYCL_SINGLE_GPU=0
 fi

 #export GGML_SYCL_DEBUG=1
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -73,6 +73,7 @@ struct my_llama_model {
 static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL     = "train_model";
 static const char * LLM_KV_TRAINING_TYPE                 = "training.type";

+static const char * LLM_KV_GENERAL_NAME                = "general.name";
 static const char * LLM_KV_GENERAL_ARCHITECTURE        = "general.architecture";
 static const char * LLM_KV_GENERAL_FILE_TYPE           = "general.file_type";

@@ -529,6 +530,7 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex

 static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) {
    const char * arch = "llama";
+
    enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32;

    std::vector<char> keybuf;
@@ -540,6 +542,7 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo

    // set arch
    gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch);
+    gguf_set_val_str(fctx, LLM_KV_GENERAL_NAME, arch);
    gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);

    // set hparams
--- a/examples/ts-type-to-grammar.sh
+++ b/examples/ts-type-to-grammar.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 #
 # ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}"
-# python examples/json-schema-to-grammar.py https://json.schemastore.org/tsconfig.json
+# python examples/json_schema_to_grammar.py https://json.schemastore.org/tsconfig.json
 #
 set -euo pipefail

@@ -25,4 +25,4 @@ npx ts-json-schema-generator --unstable --no-top-ref --path "$DTS_FILE" --type M
 # https://github.com/YousefED/typescript-json-schema
 # npx typescript-json-schema --defaultProps --required "$DTS_FILE" MyType | tee "$SCHEMA_FILE" >&2

-./examples/json-schema-to-grammar.py "$SCHEMA_FILE"
+./examples/json_schema_to_grammar.py "$SCHEMA_FILE"
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1712163089,
-        "narHash": "sha256-Um+8kTIrC19vD4/lUCN9/cU9kcOsD1O1m+axJqQPyMM=",
+        "lastModified": 1713537308,
+        "narHash": "sha256-XtTSSIB2DA6tOv+l0FhvfDMiyCmhoRbNB+0SeInZkbk=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "fd281bd6b7d3e32ddfa399853946f782553163b5",
+        "rev": "5c24cf2f0a12ad855f444c30b2421d044120c66f",
        "type": "github"
      },
      "original": {
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -371,16 +371,16 @@ struct ggml_gallocr {
 };

 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
-    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
+    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
    GGML_ASSERT(galloc != NULL);

-    galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
+    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
    GGML_ASSERT(galloc->bufts != NULL);

-    galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
    GGML_ASSERT(galloc->buffers != NULL);

-    galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
+    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
    GGML_ASSERT(galloc->buf_tallocs != NULL);

    for (int i = 0; i < n_bufs; i++) {
@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
        free(galloc->hash_set.keys);
        free(galloc->hash_values);
        galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
-        galloc->hash_values   = calloc(sizeof(struct hash_node), hash_size);
+        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
+        galloc->hash_values   = calloc(hash_size, sizeof(struct hash_node));
        GGML_ASSERT(galloc->hash_set.keys != NULL);
        GGML_ASSERT(galloc->hash_values != NULL);
    } else {
@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
    // set the node_allocs from the hash table
    if (galloc->n_nodes < graph->n_nodes) {
        free(galloc->node_allocs);
-        galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
+        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
        GGML_ASSERT(galloc->node_allocs != NULL);
    }
    galloc->n_nodes = graph->n_nodes;
@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
    }
    if (galloc->n_leafs < graph->n_leafs) {
        free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
+        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
        GGML_ASSERT(galloc->leaf_allocs != NULL);
    }
    galloc->n_leafs = graph->n_leafs;
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    switch (op->op) {
        case GGML_OP_CPY:
-            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
+            return
+                op->type != GGML_TYPE_IQ2_XXS &&
+                op->type != GGML_TYPE_IQ2_XS  &&
+                op->type != GGML_TYPE_IQ1_S   &&
+                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
        case GGML_OP_MUL_MAT:
            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
        default:
@@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
    GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU

-    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
+    struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));

    // initialize hash table
    sched->hash_set          = ggml_hash_set_new(graph_size);
-    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
-    sched->tensor_copies     = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
+    sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
+    sched->tensor_copies     = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));

    const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
-    sched->node_backend_ids  = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
-    sched->leaf_backend_ids  = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
+    sched->node_backend_ids  = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+    sched->leaf_backend_ids  = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));

    sched->n_backends = n_backends;

    sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

    const int initial_splits_capacity = 16;
-    sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+    sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
    sched->splits_capacity = initial_splits_capacity;

    for (int b = 0; b < n_backends; b++) {
@@ -1968,10 +1972,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
 struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
    struct ggml_hash_set hash_set = {
        /* .size = */ graph->visited_hash_table.size,
-        /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
+        /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
    };
-    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
-    bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
+    struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+    bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));

    struct ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1231,7 +1231,7 @@ static void ggml_cuda_op_mul_mat_cublas(

    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-        ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool());
+        ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
        if (src0->type != GGML_TYPE_F16) {
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
@@ -1241,7 +1241,7 @@ static void ggml_cuda_op_mul_mat_cublas(
        }
        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();

-        ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool());
+        ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
        if (src1->type != GGML_TYPE_F16) {
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
@@ -1250,7 +1250,7 @@ static void ggml_cuda_op_mul_mat_cublas(
            to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
        }
        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
-        ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(), row_diff*src1_ncols);
+        ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);

        const half alpha_f16 = 1.0f;
        const half beta_f16 = 0.0f;
@@ -1946,7 +1946,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
    } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
        // KQV single-batch
        ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
-    } else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
        // KQ + KQV multi-batch
        ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
    } else if (use_dequantize_mul_mat_vec) {
@@ -1960,20 +1960,73 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
    }
 }

+struct mmid_row_mapping {
+    int32_t i1;
+    int32_t i2;
+};
+
+static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
+                                                 int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
+                                                 const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
+                                                 int64_t ne11, int64_t ne10,
+                                                 size_t nb11, size_t nb12) {
+    int32_t iid1 = blockIdx.x;
+    int32_t id = blockIdx.y;
+
+    const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
+
+    if (row_id_i != i02) {
+        return;
+    }
+
+    const int64_t i11 = id % ne11;
+    const int64_t i12 = iid1;
+
+    __shared__ int src1_row;
+    if (threadIdx.x == 0) {
+        src1_row = atomicAdd(cur_src1_row, 1);
+        row_mapping[src1_row] = {id, iid1};
+    }
+    __syncthreads();
+
+    const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
+    float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
+
+    for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
+        src1_row_contiguous[i] = src1_row_original[i];
+    }
+}
+
+static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
+                                                  const mmid_row_mapping * __restrict__ row_mapping,
+                                                  int64_t ne0,
+                                                  size_t nb1, size_t nb2) {
+    int32_t i = blockIdx.x;
+
+    const int32_t i1 = row_mapping[i].i1;
+    const int32_t i2 = row_mapping[i].i2;
+
+    const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
+    float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
+
+    for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
+        dst_row_original[j] = dst_row_contiguous[j];
+    }
+}
+
 static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
    const ggml_tensor * ids  = dst->src[2];

+    GGML_TENSOR_BINARY_OP_LOCALS
+
    GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");

    cudaStream_t stream = ctx.stream();

-    const size_t nb11 = src1->nb[1];
-    const size_t nb1  =  dst->nb[1];
-
-    const int32_t id = ((int32_t *) dst->op_params)[0];
-    const int32_t n_as = src0->ne[2];
+    const int64_t n_as = ne02;
+    const int64_t n_ids = ids->ne[0];

    std::vector<char> ids_host(ggml_nbytes(ids));
    const char * ids_dev = (const char *) ids->data;
@@ -1982,7 +2035,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *

    ggml_tensor src0_row = *src0;
    ggml_tensor src1_row = *src1;
-    ggml_tensor dst_row = *dst;
+    ggml_tensor dst_row  = *dst;

    char * src0_original = (char *) src0->data;
    char * src1_original = (char *) src1->data;
@@ -1990,19 +2043,39 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *

    src0_row.ne[2] = 1;
    src0_row.ne[3] = 1;
-    src0_row.nb[3] = src0->nb[2];
+    src0_row.nb[3] = nb02;

-    if (src1->ne[1] == 1) {
-        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+    src1_row.ne[1] = 1;
+    src1_row.ne[2] = 1;
+    src1_row.ne[3] = 1;
+    src1_row.nb[2] = nb11;
+    src1_row.nb[3] = nb11;

-            GGML_ASSERT(row_id >= 0 && row_id < n_as);
+    dst_row.ne[1] = 1;
+    dst_row.ne[2] = 1;
+    dst_row.ne[3] = 1;
+    dst_row.nb[2] = nb1;
+    dst_row.nb[3] = nb1;

-            src0_row.data = src0_original + row_id*src0->nb[2];
-            src1_row.data = src1_original + i01*src1->nb[1];
-            dst_row.data  =  dst_original + i01*dst->nb[1];
+    if (ne12 == 1) {
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+            for (int64_t id = 0; id < n_ids; id++) {
+                const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);

-            ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+                GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+                const int64_t i11 = id % ne11;
+                const int64_t i12 = iid1;
+
+                const int64_t i1 = id;
+                const int64_t i2 = i12;
+
+                src0_row.data = src0_original + i02*nb02;
+                src1_row.data = src1_original + i11*nb11 + i12*nb12;
+                dst_row.data  =  dst_original + i1*nb1   + i2*nb2;
+
+                ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+            }
        }
    } else {
        ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
@@ -2011,54 +2084,69 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
        src1_row.data = src1_contiguous.get();
        dst_row.data  =  dst_contiguous.get();

-        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+        for (int64_t i02 = 0; i02 < n_as; i02++) {
            int64_t num_src1_rows = 0;
-            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);

-                if (row_id_i != row_id) {
-                    continue;
+            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+                for (int64_t id = 0; id < n_ids; id++) {
+                    const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                    GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+
+                    if (row_id_i != i02) {
+                        continue;
+                    }
+
+                    num_src1_rows++;
                }
-
-                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
-                                        nb11, cudaMemcpyDeviceToDevice, stream));
-                num_src1_rows++;
            }

            if (num_src1_rows == 0) {
                continue;
            }

-            src0_row.data = src0_original + row_id*src0->nb[2];
+            ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
+            ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
+            CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
+
+            {
+                dim3 block_dims(std::min((unsigned int)ne10, 768u));
+                dim3 grid_dims(ids->ne[1], n_ids);
+                k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                        src1_original, src1_contiguous.get(),
+                        dev_cur_src1_row.get(), dev_row_mapping.get(),
+                        ids_dev, i02, ids->nb[1], ids->nb[0],
+                        ne11, ne10,
+                        nb11, nb12);
+                CUDA_CHECK(cudaGetLastError());
+            }
+
+            src0_row.data = src0_original + i02*nb02;
+
+            GGML_ASSERT(nb11 == sizeof(float)*ne10);
+            GGML_ASSERT(nb1 == sizeof(float)*ne0);

            src1_row.ne[1] = num_src1_rows;
-            dst_row.ne[1] = num_src1_rows;
-
            src1_row.nb[1] = nb11;
            src1_row.nb[2] = num_src1_rows*nb11;
            src1_row.nb[3] = num_src1_rows*nb11;

+            dst_row.ne[1] = num_src1_rows;
            dst_row.nb[1] = nb1;
            dst_row.nb[2] = num_src1_rows*nb1;
            dst_row.nb[3] = num_src1_rows*nb1;

            ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);

-            num_src1_rows = 0;
-            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-                if (row_id_i != row_id) {
-                    continue;
-                }
-
-                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
-                                        nb1, cudaMemcpyDeviceToDevice, stream));
-                num_src1_rows++;
+            {
+                dim3 block_dims(std::min((unsigned int)ne0, 768u));
+                dim3 grid_dims(num_src1_rows);
+                k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                        dst_original, dst_contiguous.get(),
+                        dev_row_mapping.get(),
+                        ne0,
+                        nb1, nb2);
+                CUDA_CHECK(cudaGetLastError());
            }
        }
    }
@@ -2487,7 +2575,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
 GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
    const int min_batch_size = 32;

-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);

    GGML_UNUSED(backend);
 }
--- a/ggml-cuda/binbcast.cu
+++ b/ggml-cuda/binbcast.cu
@@ -22,6 +22,7 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
        int ne0, int ne1, int ne2, int ne3,
        int ne10, int ne11, int ne12, int ne13,
        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s00,*/ int s01, int s02, int s03,
        /*int s10,*/ int s11, int s12, int s13) {
    const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
    const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
@@ -36,9 +37,9 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
    const int i12 = i2 % ne12;
    const int i13 = i3 % ne13;

-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;

    const src0_t * src0_row = src0 + i_src0;
    const src1_t * src1_row = src1 + i_src1;
@@ -55,6 +56,7 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
        int ne0, int ne1, int ne2, int ne3,
        int ne10, int ne11, int ne12, int ne13,
        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s00,*/ int s01, int s02, int s03,
        /*int s10,*/ int s11, int s12, int s13) {

    const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -72,9 +74,9 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
    const int i12 = i2 % ne12;
    const int i13 = i3 % ne13;

-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;

    const src0_t * src0_row = src0 + i_src0;
    const src1_t * src1_row = src1 + i_src1;
@@ -101,10 +103,14 @@ struct bin_bcast_cuda {
        int nr[4] = { nr0, nr1, nr2, nr3 };

        // collapse dimensions until first broadcast dimension
-        int64_t cne0[] = {ne0, ne1, ne2, ne3};
+        int64_t cne[] = {ne0, ne1, ne2, ne3};
+        int64_t cne0[] = {ne00, ne01, ne02, ne03};
        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-        size_t cnb0[] = {nb0, nb1, nb2, nb3};
+
+        size_t cnb[] = {nb0, nb1, nb2, nb3};
+        size_t cnb0[] = {nb00, nb01, nb02, nb03};
        size_t cnb1[] = {nb10, nb11, nb12, nb13};
+
        auto collapse = [](int64_t cne[]) {
            cne[0] *= cne[1];
            cne[1] = cne[2];
@@ -118,32 +124,47 @@ struct bin_bcast_cuda {
            cnb[3] *= cne[3];
        };

-        for (int i = 0; i < 4; i++) {
-            if (nr[i] != 1) {
-                break;
-            }
-            if (i > 0) {
-                collapse_nb(cnb0, cne0);
-                collapse_nb(cnb1, cne1);
-                collapse(cne0);
-                collapse(cne1);
+        if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
+            for (int i = 0; i < 4; i++) {
+                if (nr[i] != 1) {
+                    break;
+                }
+                if (i > 0) {
+                    collapse_nb(cnb, cne);
+                    collapse_nb(cnb0, cne0);
+                    collapse_nb(cnb1, cne1);
+                    collapse(cne);
+                    collapse(cne0);
+                    collapse(cne1);
+                }
            }
        }
+
        {
-            int64_t ne0 = cne0[0];
-            int64_t ne1 = cne0[1];
-            int64_t ne2 = cne0[2];
-            int64_t ne3 = cne0[3];
+            int64_t ne0 = cne[0];
+            int64_t ne1 = cne[1];
+            int64_t ne2 = cne[2];
+            int64_t ne3 = cne[3];
+
+            //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00);
+            //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01);
+            //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
+            //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);

            int64_t ne10 = cne1[0];
            int64_t ne11 = cne1[1];
            int64_t ne12 = cne1[2];
            int64_t ne13 = cne1[3];

-            size_t nb0 = cnb0[0];
-            size_t nb1 = cnb0[1];
-            size_t nb2 = cnb0[2];
-            size_t nb3 = cnb0[3];
+            size_t nb0 = cnb[0];
+            size_t nb1 = cnb[1];
+            size_t nb2 = cnb[2];
+            size_t nb3 = cnb[3];
+
+            size_t nb00 = cnb0[0];
+            size_t nb01 = cnb0[1];
+            size_t nb02 = cnb0[2];
+            size_t nb03 = cnb0[3];

            size_t nb10 = cnb1[0];
            size_t nb11 = cnb1[1];
@@ -160,7 +181,28 @@ struct bin_bcast_cuda {
            size_t s12 = nb12 / sizeof(src1_t);
            size_t s13 = nb13 / sizeof(src1_t);

+            size_t s00 = nb00 / sizeof(src0_t);
+            size_t s01 = nb01 / sizeof(src0_t);
+            size_t s02 = nb02 / sizeof(src0_t);
+            size_t s03 = nb03 / sizeof(src0_t);
+
+            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
+
+            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
+
+            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
+
            GGML_ASSERT(s0 == 1);
+            GGML_ASSERT(s00 == 1);
            GGML_ASSERT(s10 == 1);

            const int block_size = 128;
@@ -179,13 +221,14 @@ struct bin_bcast_cuda {
            );

            if (block_nums.z > 65535) {
-                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+                // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel
                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
                    src0_dd, src1_dd, dst_dd,
                    ne0, ne1, ne2, ne3,
                    ne10, ne11, ne12, ne13,
                    /* s0, */ s1, s2, s3,
+                    /* s00, */ s01, s02, s03,
                    /* s10, */ s11, s12, s13);
            } else {
                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
@@ -193,6 +236,7 @@ struct bin_bcast_cuda {
                    ne0, ne1, ne2, ne3,
                    ne10, ne11, ne12, ne13,
                    /* s0, */ s1, s2, s3,
+                    /* s00, */ s01, s02, s03,
                    /* s10, */ s11, s12, s13);
            }
        }
--- a/ggml-cuda/convert.cu
+++ b/ggml-cuda/convert.cu
@@ -45,6 +45,8 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
        vals[ix] = x0[ix];
    }

+    __syncthreads();
+
 #pragma unroll
    for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
        if (need_check && i0 + iy + 2*threadIdx.x >= k) {
--- a/ggml-impl.h
+++ b/ggml-impl.h
@@ -11,6 +11,12 @@
 #include <string.h> // memcpy
 #include <math.h>   // fabsf

+#undef MIN
+#undef MAX
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -45,7 +51,7 @@ extern "C" {
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
+#if defined(__ARM_NEON)

 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //
@@ -53,8 +59,262 @@ extern "C" {
 //
 #include <arm_neon.h>

+#ifdef _MSC_VER
+
+typedef uint16_t ggml_fp16_internal_t;
+
+#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+#else
+
 typedef __fp16 ggml_fp16_internal_t;

+#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+#endif // _MSC_VER
+
+#if !defined(__aarch64__)
+
+// 32-bit ARM compatibility
+
+// vaddvq_s16
+// vpaddq_s16
+// vpaddq_s32
+// vaddvq_s32
+// vaddvq_f32
+// vmaxvq_f32
+// vcvtnq_s32_f32
+// vzip1_u8
+// vzip2_u8
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
+    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+    return vcombine_s32(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+
+inline static float vaddvq_f32(float32x4_t v) {
+    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+inline static float vmaxvq_f32(float32x4_t v) {
+    return
+        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+    int32x4_t res;
+
+    res[0] = roundf(vgetq_lane_f32(v, 0));
+    res[1] = roundf(vgetq_lane_f32(v, 1));
+    res[2] = roundf(vgetq_lane_f32(v, 2));
+    res[3] = roundf(vgetq_lane_f32(v, 3));
+
+    return res;
+}
+
+inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+    uint8x8_t res;
+
+    res[0] = a[0]; res[1] = b[0];
+    res[2] = a[1]; res[3] = b[1];
+    res[4] = a[2]; res[5] = b[2];
+    res[6] = a[3]; res[7] = b[3];
+
+    return res;
+}
+
+inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+    uint8x8_t res;
+
+    res[0] = a[4]; res[1] = b[4];
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
+}
+
+// vld1q_s16_x2
+// vld1q_u8_x2
+// vld1q_u8_x4
+// vld1q_s8_x2
+// vld1q_s8_x4
+// TODO: double-check these work correctly
+
+typedef struct ggml_int16x8x2_t {
+    int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+    ggml_int16x8x2_t res;
+
+    res.val[0] = vld1q_s16(ptr + 0);
+    res.val[1] = vld1q_s16(ptr + 8);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x2_t {
+    uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+    ggml_uint8x16x2_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x4_t {
+    uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+    ggml_uint8x16x4_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+    res.val[2] = vld1q_u8(ptr + 32);
+    res.val[3] = vld1q_u8(ptr + 48);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x2_t {
+    int8x16_t val[2];
+} ggml_int8x16x2_t;
+
+inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+    ggml_int8x16x2_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x4_t {
+    int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+    ggml_int8x16x4_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+    res.val[2] = vld1q_s8(ptr + 32);
+    res.val[3] = vld1q_s8(ptr + 48);
+
+    return res;
+}
+
+// NOTE: not tested
+inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+// NOTE: not tested
+inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+    uint8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+#else
+
+#define ggml_int16x8x2_t  int16x8x2_t
+#define ggml_uint8x16x2_t uint8x16x2_t
+#define ggml_uint8x16x4_t uint8x16x4_t
+#define ggml_int8x16x2_t  int8x16x2_t
+#define ggml_int8x16x4_t  int8x16x4_t
+
+#define ggml_vld1q_s16_x2 vld1q_s16_x2
+#define ggml_vld1q_u8_x2  vld1q_u8_x2
+#define ggml_vld1q_u8_x4  vld1q_u8_x4
+#define ggml_vld1q_s8_x2  vld1q_s8_x2
+#define ggml_vld1q_s8_x4  vld1q_s8_x4
+#define ggml_vqtbl1q_s8   vqtbl1q_s8
+#define ggml_vqtbl1q_u8   vqtbl1q_u8
+
+#endif // !defined(__aarch64__)
+
+#if !defined(__ARM_FEATURE_DOTPROD)
+
+inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
+    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
+    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
+
+    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
+}
+
+#else
+
+#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
+
+#endif // !defined(__ARM_FEATURE_DOTPROD)
+
+#endif // defined(__ARM_NEON)
+
+#if defined(__ARM_NEON) && !defined(__MSC_VER)
+
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

@@ -75,8 +335,6 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {

 #else

-typedef uint16_t ggml_fp16_internal_t;
-
 #ifdef __wasm_simd128__
 #include <wasm_simd128.h>
 #else
@@ -88,7 +346,7 @@ typedef uint16_t ggml_fp16_internal_t;
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
 #if !defined(__riscv)
 #include <immintrin.h>
 #endif
@@ -221,7 +479,7 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {

 #endif // __F16C__

-#endif // __ARM_NEON
+#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)

 // precomputed f32 table for f16 (256 KB)
 // defined in ggml.c, initialized in ggml_init()
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -37,11 +37,15 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_DIV_ROW,
    GGML_METAL_KERNEL_TYPE_SCALE,
    GGML_METAL_KERNEL_TYPE_SCALE_4,
+    GGML_METAL_KERNEL_TYPE_CLAMP,
    GGML_METAL_KERNEL_TYPE_TANH,
    GGML_METAL_KERNEL_TYPE_RELU,
    GGML_METAL_KERNEL_TYPE_GELU,
+    GGML_METAL_KERNEL_TYPE_GELU_4,
    GGML_METAL_KERNEL_TYPE_GELU_QUICK,
+    GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,
    GGML_METAL_KERNEL_TYPE_SILU,
+    GGML_METAL_KERNEL_TYPE_SILU_4,
    GGML_METAL_KERNEL_TYPE_SOFT_MAX,
    GGML_METAL_KERNEL_TYPE_SOFT_MAX_4,
    GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,
@@ -468,11 +472,15 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW,                   div_row,                true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE,                     scale,                  true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4,                   scale_4,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP,                     clamp,                  true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH,                      tanh,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU,                      relu,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU,                      gelu,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4,                    gelu_4,                 true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK,                gelu_quick,             true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4,              gelu_quick_4,           true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU,                      silu,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4,                    silu_4,                 true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX,                  soft_max,               ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4,                soft_max_4,             ctx->support_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,             diag_mask_inf,          true);
@@ -713,6 +721,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SCALE:
+        case GGML_OP_CLAMP:
        case GGML_OP_SQR:
        case GGML_OP_SUM_ROWS:
            return true;
@@ -1154,8 +1163,30 @@ static enum ggml_status ggml_metal_graph_compute(

                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
+                case GGML_OP_CLAMP:
+                {
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline;
+
+                    float min;
+                    float max;
+                    memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float));
+                    memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float));
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
+                    [encoder setBytes:&min length:sizeof(min) atIndex:2];
+                    [encoder setBytes:&max length:sizeof(max) atIndex:3];
+
+                    const int64_t n = ggml_nelements(dst);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
                case GGML_OP_UNARY:
                    switch (ggml_get_unary_op(gf->nodes[i])) {
+                        // we are not taking into account the strides, so for now require contiguous tensors
+                        GGML_ASSERT(ggml_is_contiguous(src0));
+
                        case GGML_UNARY_OP_TANH:
                            {
                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
@@ -1182,42 +1213,60 @@ static enum ggml_status ggml_metal_graph_compute(
                            } break;
                        case GGML_UNARY_OP_GELU:
                            {
-                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
+                                int64_t n = ggml_nelements(dst);
+
+                                id<MTLComputePipelineState> pipeline = nil;
+
+                                if (n % 4 == 0) {
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_4].pipeline;
+                                    n /= 4;
+                                } else {
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
+                                }

                                [encoder setComputePipelineState:pipeline];
                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];

-                                const int64_t n = ggml_nelements(dst);
-                                GGML_ASSERT(n % 4 == 0);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                            } break;
                        case GGML_UNARY_OP_GELU_QUICK:
                            {
-                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
+                                int64_t n = ggml_nelements(dst);
+
+                                id<MTLComputePipelineState> pipeline = nil;
+
+                                if (n % 4 == 0) {
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK_4].pipeline;
+                                    n /= 4;
+                                } else {
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
+                                }

                                [encoder setComputePipelineState:pipeline];
                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];

-                                const int64_t n = ggml_nelements(dst);
-                                GGML_ASSERT(n % 4 == 0);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                            } break;
                        case GGML_UNARY_OP_SILU:
                            {
-                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
+                                int64_t n = ggml_nelements(dst);
+
+                                id<MTLComputePipelineState> pipeline = nil;
+
+                                if (n % 4 == 0) {
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU_4].pipeline;
+                                    n /= 4;
+                                } else {
+                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
+                                }

                                [encoder setComputePipelineState:pipeline];
                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];

-                                const int64_t n = ggml_nelements(dst);
-                                GGML_ASSERT(n % 4 == 0);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                            } break;
                        default:
                            {
@@ -1683,15 +1732,10 @@ static enum ggml_status ggml_metal_graph_compute(
                    } break;
                case GGML_OP_MUL_MAT_ID:
                    {
-                        //GGML_ASSERT(ne00 == ne10);
-                        //GGML_ASSERT(ne03 == ne13);
                        const int n_as = src0->ne[2];

-                        // max size of the src1ids array in the kernel shared buffer
-                        GGML_ASSERT(ne11 <= 4096);
-
                        // src2 = ids
-                        const int64_t  ne20 = src2->ne[0]; GGML_UNUSED(ne20);
+                        const int64_t  ne20 = src2->ne[0];
                        const int64_t  ne21 = src2->ne[1];
                        const int64_t  ne22 = src2->ne[2]; GGML_UNUSED(ne22);
                        const int64_t  ne23 = src2->ne[3]; GGML_UNUSED(ne23);
@@ -1712,15 +1756,13 @@ static enum ggml_status ggml_metal_graph_compute(

                        // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                        // to the matrix-vector kernel
-                        int ne11_mm_min = n_as;
+                        // ne20 = n_used_experts
+                        // ne21 = n_rows
+                        const int dst_rows = ne20*ne21;
+                        const int dst_rows_min = n_as;

-                        const int idx = ((int32_t *) dst->op_params)[0];
-
-                        // batch size
-                        GGML_ASSERT(ne21 == ne11); // ?
-                        GGML_ASSERT(ne12 == 1 && ne13 == 1); // no broadcasting
-                        const uint r2 = 1;
-                        const uint r3 = 1;
+                        // max size of the rowids array in the kernel shared buffer
+                        GGML_ASSERT(dst_rows <= 2048);

                        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
@@ -1730,7 +1772,7 @@ static enum ggml_status ggml_metal_graph_compute(
                        // !!!
                        if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
                            ne00 % 32 == 0 && ne00 >= 64 &&
-                            ne11 > ne11_mm_min) {
+                            dst_rows > dst_rows_min) {

                            // some Metal matrix data types require aligned pointers
                            // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
@@ -1772,26 +1814,26 @@ static enum ggml_status ggml_metal_graph_compute(
                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
                            [encoder setBuffer:id_src2 offset:offs_src2    atIndex:3];
-                            [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:4];
-                            [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:5];
-                            [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:6];
-                            [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:7];
-                            [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:8];
-                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:9];
-                            [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:10];
-                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:11];
-                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:12];
-                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:13];
-                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:14];
-                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:15];
-                            [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:16];
-                            [encoder setBytes:&r2      length:sizeof(r2)   atIndex:17];
-                            [encoder setBytes:&r3      length:sizeof(r3)   atIndex:18];
-                            [encoder setBytes:&idx     length:sizeof(idx)  atIndex:19];
+                            [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:4];
+                            [encoder setBytes:&ne21    length:sizeof(ne21) atIndex:5];
+                            [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:6];
+                            [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:7];
+                            [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:8];
+                            [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:9];
+                            [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:10];
+                            [encoder setBytes:&ne11    length:sizeof(ne11) atIndex:11];
+                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:12];
+                            [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:13];
+                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:14];
+                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:15];
+                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:16];
+                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:17];
+                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:18];
+                            [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:19];

-                            [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0];
+                            [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0];

-                            [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne01 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                            [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                        } else {
                            int nth0 = 32;
                            int nth1 = 1;
@@ -1926,7 +1968,12 @@ static enum ggml_status ggml_metal_graph_compute(
                                    {
                                        nth0 = 4;
                                        nth1 = 16;
+                                    #if QK_K == 64
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
+                                    #else
                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
+                                    #endif
+
                                    } break;
                                default:
                                    {
@@ -1939,72 +1986,72 @@ static enum ggml_status ggml_metal_graph_compute(
                                GGML_ASSERT(ne00 >= nth0*nth1);
                            }

-                            const int64_t _ne1 = 1; // kernels needs a reference in constant memory
-
                            [encoder setComputePipelineState:pipeline];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
                            [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3];
-                            [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:4];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:5];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:6];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:7];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:8];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:9];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:10];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                            [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:12];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:18];
-                            [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:19];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:20];
-                            [encoder setBytes:&r2   length:sizeof(r2)   atIndex:21];
-                            [encoder setBytes:&r3   length:sizeof(r3)   atIndex:22];
-                            [encoder setBytes:&idx  length:sizeof(idx)  atIndex:23];
+                            [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
+                            [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
+                            [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6];
+                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7];
+                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:8];
+                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:9];
+                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:10];
+                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:11];
+                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:12];
+                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:13];
+                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:14];
+                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:15];
+                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:16];
+                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:17];
+                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:18];
+                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:19];
+                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:20];
+                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:21];
+                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:22];
+
+                            const int64_t _ne1 = 1;
+                            const int tgz = dst_rows;

                            if (src0t == GGML_TYPE_Q4_0  || src0t == GGML_TYPE_Q4_1  || src0t == GGML_TYPE_Q5_0 ||
                                src0t == GGML_TYPE_Q5_1  || src0t == GGML_TYPE_Q8_0  || src0t == GGML_TYPE_Q2_K ||
                                src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
                            else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
                                const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
                            else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) {
                                const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4;
                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
                            else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) {
                                const int mem_size = 32*sizeof(float);
                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
                            else if (src0t == GGML_TYPE_Q4_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
                            else if (src0t == GGML_TYPE_Q3_K) {
 #ifdef GGML_QKK_64
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 #else
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 #endif
                            }
                            else if (src0t == GGML_TYPE_Q5_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
                            else if (src0t == GGML_TYPE_Q6_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            } else {
-                                const int64_t ny = (_ne1 + nrows - 1)/nrows;
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne21*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                const int64_t ny = (_ne1 + nrows - 1)/nrows; // = _ne1
+                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
                        }
                    } break;
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -14,47 +14,6 @@
 #include <stdlib.h> // for qsort
 #include <stdio.h>  // for GGML_ASSERT

-#ifdef __ARM_NEON
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 #define UNUSED GGML_UNUSED

 // some compilers don't provide _mm256_set_m128i, e.g. gcc 7
@@ -132,7 +91,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
 }

 static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
+#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
    const __m256i zero = _mm256_setzero_si256();
    const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
    return _mm256_cvtepi32_ps(summed_pairs);
@@ -276,258 +235,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // __AVX__ || __AVX2__ || __AVX512F__
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

-#if defined(__ARM_NEON)
-
-#ifdef _MSC_VER
-
-#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
-
-#else
-
-#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
-
-#endif
-
-#if !defined(__aarch64__)
-
-// 64-bit compatibility
-
-// vaddvq_s16
-// vpaddq_s16
-// vpaddq_s32
-// vaddvq_s32
-// vaddvq_f32
-// vmaxvq_f32
-// vcvtnq_s32_f32
-// vzip1_u8
-// vzip2_u8
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-    return vcombine_s16(a0, b0);
-}
-
-inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
-    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
-    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
-    return vcombine_s32(a0, b0);
-}
-
-inline static int32_t vaddvq_s32(int32x4_t v) {
-    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-inline static float vmaxvq_f32(float32x4_t v) {
-    return
-        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
-inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
-    int32x4_t res;
-
-    res[0] = roundf(vgetq_lane_f32(v, 0));
-    res[1] = roundf(vgetq_lane_f32(v, 1));
-    res[2] = roundf(vgetq_lane_f32(v, 2));
-    res[3] = roundf(vgetq_lane_f32(v, 3));
-
-    return res;
-}
-
-inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[0]; res[1] = b[0];
-    res[2] = a[1]; res[3] = b[1];
-    res[4] = a[2]; res[5] = b[2];
-    res[6] = a[3]; res[7] = b[3];
-
-    return res;
-}
-
-inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[4]; res[1] = b[4];
-    res[2] = a[5]; res[3] = b[5];
-    res[4] = a[6]; res[5] = b[6];
-    res[6] = a[7]; res[7] = b[7];
-
-    return res;
-}
-
-// vld1q_s16_x2
-// vld1q_u8_x2
-// vld1q_u8_x4
-// vld1q_s8_x2
-// vld1q_s8_x4
-// TODO: double-check these work correctly
-
-typedef struct ggml_int16x8x2_t {
-    int16x8_t val[2];
-} ggml_int16x8x2_t;
-
-inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
-    ggml_int16x8x2_t res;
-
-    res.val[0] = vld1q_s16(ptr + 0);
-    res.val[1] = vld1q_s16(ptr + 8);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x2_t {
-    uint8x16_t val[2];
-} ggml_uint8x16x2_t;
-
-inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
-    ggml_uint8x16x2_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x4_t {
-    uint8x16_t val[4];
-} ggml_uint8x16x4_t;
-
-inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
-    ggml_uint8x16x4_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-    res.val[2] = vld1q_u8(ptr + 32);
-    res.val[3] = vld1q_u8(ptr + 48);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x2_t {
-    int8x16_t val[2];
-} ggml_int8x16x2_t;
-
-inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
-    ggml_int8x16x2_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x4_t {
-    int8x16_t val[4];
-} ggml_int8x16x4_t;
-
-inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
-    ggml_int8x16x4_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-    res.val[2] = vld1q_s8(ptr + 32);
-    res.val[3] = vld1q_s8(ptr + 48);
-
-    return res;
-}
-
-// NOTE: not tested
-inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
-    int8x16_t res;
-
-    res[ 0] = a[b[ 0]];
-    res[ 1] = a[b[ 1]];
-    res[ 2] = a[b[ 2]];
-    res[ 3] = a[b[ 3]];
-    res[ 4] = a[b[ 4]];
-    res[ 5] = a[b[ 5]];
-    res[ 6] = a[b[ 6]];
-    res[ 7] = a[b[ 7]];
-    res[ 8] = a[b[ 8]];
-    res[ 9] = a[b[ 9]];
-    res[10] = a[b[10]];
-    res[11] = a[b[11]];
-    res[12] = a[b[12]];
-    res[13] = a[b[13]];
-    res[14] = a[b[14]];
-    res[15] = a[b[15]];
-
-    return res;
-}
-
-// NOTE: not tested
-inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
-    uint8x16_t res;
-
-    res[ 0] = a[b[ 0]];
-    res[ 1] = a[b[ 1]];
-    res[ 2] = a[b[ 2]];
-    res[ 3] = a[b[ 3]];
-    res[ 4] = a[b[ 4]];
-    res[ 5] = a[b[ 5]];
-    res[ 6] = a[b[ 6]];
-    res[ 7] = a[b[ 7]];
-    res[ 8] = a[b[ 8]];
-    res[ 9] = a[b[ 9]];
-    res[10] = a[b[10]];
-    res[11] = a[b[11]];
-    res[12] = a[b[12]];
-    res[13] = a[b[13]];
-    res[14] = a[b[14]];
-    res[15] = a[b[15]];
-
-    return res;
-}
-
-#else
-
-#define ggml_int16x8x2_t  int16x8x2_t
-#define ggml_uint8x16x2_t uint8x16x2_t
-#define ggml_uint8x16x4_t uint8x16x4_t
-#define ggml_int8x16x2_t  int8x16x2_t
-#define ggml_int8x16x4_t  int8x16x4_t
-
-#define ggml_vld1q_s16_x2 vld1q_s16_x2
-#define ggml_vld1q_u8_x2  vld1q_u8_x2
-#define ggml_vld1q_u8_x4  vld1q_u8_x4
-#define ggml_vld1q_s8_x2  vld1q_s8_x2
-#define ggml_vld1q_s8_x4  vld1q_s8_x4
-#define ggml_vqtbl1q_s8   vqtbl1q_s8
-#define ggml_vqtbl1q_u8   vqtbl1q_u8
-
-#endif
-
-#if !defined(__ARM_FEATURE_DOTPROD)
-
-inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
-    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
-    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-
-    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
-}
-
-#else
-
-#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
-
-#endif
-
-#endif
-
 #if defined(__ARM_NEON) || defined(__wasm_simd128__)
 #define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
 #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
@@ -12676,3 +12383,287 @@ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k)
    block_iq2_s * restrict y = vy;
    quantize_row_iq2_s_reference(x, y, k);
 }
+
+static bool validate_float(float f, size_t i) {
+    if (isinf(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
+        return false;
+    }
+
+    if (isnan(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
+        return false;
+    }
+
+    return true;
+}
+
+static bool isinf_fp16(ggml_fp16_t f) {
+    return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) == 0;
+}
+
+static bool isnan_fp16(ggml_fp16_t f) {
+    return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0;
+}
+
+static bool validate_fp16(ggml_fp16_t f, size_t i) {
+    if (isinf_fp16(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
+        return false;
+    }
+
+    if (isnan_fp16(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
+        return false;
+    }
+
+    return true;
+}
+
+#define VALIDATE_ROW_DATA_D_F16_IMPL(type, data, nb) \
+    const type * q = (const type *) (data); \
+    for (size_t i = 0; i < (nb); ++i) { \
+        if (!validate_fp16(q[i].d, i)) { \
+            return false; \
+        } \
+    }
+
+#define VALIDATE_ROW_DATA_DM_F16_IMPL(type, data, nb, d, m) \
+    const type * q = (const type *) (data); \
+    for (size_t i = 0; i < (nb); ++i) { \
+        if (!validate_fp16(q[i].d, i) || !validate_fp16(q[i].m, i)) { \
+            return false; \
+        } \
+    }
+
+bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
+    if (type < 0 || type >= GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid type %d\n", __func__, type);
+        return false;
+    }
+
+    if (nbytes % ggml_type_size(type) != 0) {
+        fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type);
+        return false;
+    }
+
+    const size_t nb = nbytes/ggml_type_size(type);
+
+    switch (type) {
+        case GGML_TYPE_F16:
+            {
+                const ggml_fp16_t * f = (const ggml_fp16_t *) data;
+                size_t i = 0;
+#if defined(__AVX2__)
+                for (; i + 15 < nb; i += 16) {
+                    __m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
+                    __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi16(0x7c00));
+                    __m256i cmp = _mm256_cmpeq_epi16(vexp, _mm256_set1_epi16(0x7c00));
+                    int mask = _mm256_movemask_epi8(cmp);
+                    if (mask) {
+                        for (size_t j = 0; j < 16; ++j) {
+                            if (!validate_fp16(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#elif defined(__ARM_NEON)
+                for (; i + 7 < nb; i += 8) {
+                    uint16x8_t v = vld1q_u16(f + i);
+                    uint16x8_t vexp = vandq_u16(v, vdupq_n_u16(0x7c00));
+                    uint16x8_t cmp = vceqq_u16(vexp, vdupq_n_u16(0x7c00));
+                    uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(cmp, 4)), 0);
+                    if (mask) {
+                        for (size_t j = 0; j < 8; ++j) {
+                            if (!validate_fp16(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#endif
+                for (; i < nb; ++i) {
+                    if (!validate_fp16(f[i], i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_F32:
+            {
+                const float * f = (const float *) data;
+                size_t i = 0;
+#if defined(__AVX2__)
+                for (; i + 7 < nb; i += 8) {
+                    __m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
+                    __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi32(0x7f800000));
+                    __m256i cmp = _mm256_cmpeq_epi32(vexp, _mm256_set1_epi32(0x7f800000));
+                    int mask = _mm256_movemask_epi8(cmp);
+                    if (mask) {
+                        for (size_t j = 0; j < 8; ++j) {
+                            if (!validate_float(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#elif defined(__ARM_NEON)
+                for (; i + 3 < nb; i += 4) {
+                    uint32x4_t v = vld1q_u32((const uint32_t *)f + i);
+                    uint32x4_t vexp = vandq_u32(v, vdupq_n_u32(0x7f800000));
+                    uint32x4_t cmp = vceqq_u32(vexp, vdupq_n_u32(0x7f800000));
+                    uint64_t mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(cmp, 8)), 0);
+                    if (mask) {
+                        for (size_t j = 0; j < 4; ++j) {
+                            if (!validate_float(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#endif
+                for (; i < nb; ++i) {
+                    if (!validate_float(f[i], i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_F64:
+            {
+                const double * f = (const double *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                    if (!validate_float(f[i], i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_Q4_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb);
+            } break;
+        case GGML_TYPE_Q4_1:
+            {
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_1, data, nb, d, m);
+            } break;
+        case GGML_TYPE_Q5_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_0, data, nb);
+            } break;
+        case GGML_TYPE_Q5_1:
+            {
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_1, data, nb, d, m);
+            } break;
+        case GGML_TYPE_Q8_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q8_0, data, nb);
+            } break;
+        case GGML_TYPE_Q2_K:
+            {
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin);
+            } break;
+        case GGML_TYPE_Q3_K:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_K, data, nb);
+            } break;
+        case GGML_TYPE_Q4_K:
+            {
+            #ifdef GGML_QKK_64
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d[0], d[1]);
+            #else
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d, dmin);
+            #endif
+            } break;
+        case GGML_TYPE_Q5_K:
+            {
+            #ifdef GGML_QKK_64
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_K, data, nb);
+            #else
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_K, data, nb, d, dmin);
+            #endif
+            } break;
+        case GGML_TYPE_Q6_K:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q6_K, data, nb);
+            } break;
+        case GGML_TYPE_Q8_K:
+            {
+                const block_q8_K * q = (const block_q8_K *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                    if (!validate_float(q[i].d, i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_IQ1_S:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);
+            } break;
+        case GGML_TYPE_IQ1_M:
+            {
+                const block_iq1_m * q = (const block_iq1_m *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                #if QK_K == 64
+                    if (!validate_fp16(q[i].d, i)) {
+                        return false;
+                    }
+                #else
+                    iq1m_scale_t scale;
+                    const uint16_t * sc = (const uint16_t *)q[i].scales;
+                    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+                    if (!validate_fp16(scale.f16, i)) {
+                        return false;
+                    }
+                #endif
+                }
+            } break;
+        case GGML_TYPE_IQ2_XXS:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xxs, data, nb);
+            } break;
+        case GGML_TYPE_IQ2_XS:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xs, data, nb);
+            } break;
+        case GGML_TYPE_IQ2_S:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_s, data, nb);
+            } break;
+        case GGML_TYPE_IQ3_XXS:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_xxs, data, nb);
+            } break;
+
+        case GGML_TYPE_IQ3_S:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_s, data, nb);
+            } break;
+        case GGML_TYPE_IQ4_XS:
+        #if QK_K != 64
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_xs, data, nb);
+            } break;
+        #endif
+        // with QK_K == 64, iq4_xs is iq4_nl
+        case GGML_TYPE_IQ4_NL:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
+            } break;
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+            // nothing to validate
+            break;
+        default:
+            {
+                fprintf(stderr, "%s: invalid type %d\n", __func__, type);
+                return false;
+            }
+    }
+
+    return true;
+}
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
 #define SYCL_SCALE_BLOCK_SIZE 256
 #define SYCL_CLAMP_BLOCK_SIZE 256
 #define SYCL_ROPE_BLOCK_SIZE 256
-#define SYCL_SOFT_MAX_BLOCK_SIZE 1024
 #define SYCL_ALIBI_BLOCK_SIZE 32
 #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
 #define SYCL_QUANTIZE_BLOCK_SIZE 256
@@ -13080,11 +13079,13 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
                              const int nrows_y, const float scale, const float max_bias,
                              dpct::queue_ptr stream) {
    int nth = WARP_SIZE;
-    while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+    int max_block_size = g_work_group_size;
+    while (nth < ncols_x && nth < max_block_size) nth *= 2;
+    if (nth>max_block_size) nth = max_block_size;
+
    const sycl::range<3> block_dims(1, 1, nth);
    const sycl::range<3> block_nums(1, 1, nrows_x);
    const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
-    static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");

    const uint32_t n_head_kv   = nrows_x/nrows_y;
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
@@ -13094,6 +13095,12 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *

    const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
    if (n_local_scratch*sizeof(float) < local_mem_size) {
+        if (ncols_x > max_block_size) {
+            soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
+                                               max_bias, m0, m1, n_head_log2, block_nums,
+                                               block_dims, n_local_scratch, stream);
+            return;
+        }
        switch (ncols_x) {
            case 32:
                soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
@@ -15989,73 +15996,76 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
 static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
                                 const ggml_tensor *src1,
                                 ggml_tensor *dst) try {
-#if 0
-    ggml_sycl_mul_mat_id_sycl(dst);
-    // TODO: mmq/mmv support
-#endif
-
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb1  =  dst->nb[1];
-
-    const struct ggml_tensor * ids = src0;
-    const int32_t id = ((int32_t *) dst->op_params)[0];
-    const int32_t n_as = ((int32_t *) dst->op_params)[1];
-
-    std::vector<char> ids_host(ggml_nbytes(ids));
-
+    GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT &&
+                "mul_mat_id does not support split buffers");
+    const ggml_tensor *ids = dst->src[2];
    const dpct::queue_ptr stream = g_syclStreams[g_main_device][0];

-    if (ids->backend == GGML_BACKEND_TYPE_GPU) {
-        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        SYCL_CHECK(CHECK_TRY_ERROR(
-            stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)).wait()));
-        // SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
-    } else {
-        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
-    }
+    const size_t nb11 = src1->nb[1];
+    const size_t nb1 = dst->nb[1];

-    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
-    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
+    const int32_t id = ((int32_t *)dst->op_params)[0];
+    const int32_t n_as = src0->ne[2];

+    std::vector<char> ids_host(ggml_nbytes(ids));
+    const char *ids_dev = (const char *)ids->data;
+
+    SYCL_CHECK(CHECK_TRY_ERROR(
+        stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
+    SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
+
+    const ggml_tensor_extra_gpu *src0_extra =
+        (const ggml_tensor_extra_gpu *)src0->extra;
+    const ggml_tensor_extra_gpu *src1_extra =
+        (const ggml_tensor_extra_gpu *)src1->extra;
+    const ggml_tensor_extra_gpu *dst_extra =
+        (const ggml_tensor_extra_gpu *)dst->extra;
+
+    ggml_tensor_extra_gpu src0_row_extra;
    ggml_tensor_extra_gpu src1_row_extra;
    ggml_tensor_extra_gpu dst_row_extra;

+    ggml_tensor src0_row = *src0;
    ggml_tensor src1_row = *src1;
    ggml_tensor dst_row = *dst;

    src1_row.backend = GGML_BACKEND_TYPE_GPU;
    dst_row.backend  = GGML_BACKEND_TYPE_GPU;

+    src0_row.extra = &src0_row_extra;
    src1_row.extra = &src1_row_extra;
    dst_row.extra = &dst_row_extra;

-    char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
-        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
-    char * dst_original  =  dst->backend == GGML_BACKEND_TYPE_CPU ?
-        (char *)  dst->data : (char *)  dst_extra->data_device[g_main_device];
+    char *src0_original = src1->backend == GGML_BACKEND_TYPE_CPU
+                              ? (char *)src0->data
+                              : (char *)src0_extra->data_device[g_main_device];
+    char *src1_original = src1->backend == GGML_BACKEND_TYPE_CPU
+                              ? (char *)src1->data
+                              : (char *)src1_extra->data_device[g_main_device];
+    char *dst_original = dst->backend == GGML_BACKEND_TYPE_CPU
+                             ? (char *)dst->data
+                             : (char *)dst_extra->data_device[g_main_device];
+
+    src0_row.ne[2] = 1;
+    src0_row.ne[3] = 1;
+    src0_row.nb[3] = src0->nb[2];

    if (src1->ne[1] == 1) {
-        GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-        GGML_ASSERT(dst->backend  == GGML_BACKEND_TYPE_GPU);
-
        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            //int32_t row_id;
-            //SYCL_CHECK(syclMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), syclMemcpyDeviceToHost, g_syclStreams[g_main_device][0]));
-            //SYCL_CHECK(syclStreamSynchronize(g_syclStreams[g_main_device][0]));
-
-            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+            const int32_t row_id =
+                *(const int32_t *)(ids_host.data() + i01 * ids->nb[1] +
+                                   id * ids->nb[0]);

            GGML_ASSERT(row_id >= 0 && row_id < n_as);

-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+            src0_row_extra.data_device[g_main_device] =
+                src0_original + row_id * src0->nb[2];
+            src1_row_extra.data_device[g_main_device] =
+                src1_original + i01 * src1->nb[1];
+            dst_row_extra.data_device[g_main_device] =
+                dst_original + i01 * dst->nb[1];

-            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
-            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
-
-            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
-            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
-
-            ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
+            ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);
        }
    } else {
        sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
@@ -16065,8 +16075,6 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
        dst_row_extra.data_device[g_main_device]  =  dst_contiguous.get();

        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
            int64_t num_src1_rows = 0;
            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
@@ -16079,7 +16087,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,

                SYCL_CHECK(CHECK_TRY_ERROR(
                    stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
-                                   src1_original + i01 * nb11, nb11).wait()));
+                                   src1_original + i01 * nb11, nb11)));
                num_src1_rows++;
            }

@@ -16087,6 +16095,9 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
                continue;
            }

+            src0_row_extra.data_device[g_main_device] =
+                src0_original + row_id * src0->nb[2];
+
            src1_row.ne[1] = num_src1_rows;
            dst_row.ne[1] = num_src1_rows;

@@ -16098,7 +16109,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
            dst_row.nb[2] = num_src1_rows*nb1;
            dst_row.nb[3] = num_src1_rows*nb1;

-            ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
+            ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row);

            num_src1_rows = 0;
            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@@ -16112,7 +16123,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,

                SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
                    dst_original + i01 * nb1,
-                    dst_contiguous.get() + num_src1_rows * nb1, nb1).wait()));
+                    dst_contiguous.get() + num_src1_rows * nb1, nb1)));
                num_src1_rows++;
            }
        }
@@ -16814,11 +16825,13 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
    const dpct::queue_ptr stream = g_syclStreams[ctx->device][0];
    SYCL_CHECK(
        CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
-
+    char* host_buf = (char*)malloc(size);
+    memcpy(host_buf, data, size);
    SYCL_CHECK(
        CHECK_TRY_ERROR((*stream)
-                             .memcpy((char *)tensor->data + offset, data, size)
+                             .memcpy((char *)tensor->data + offset, host_buf, size)
                             .wait()));
+    free(host_buf);
 }
 catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -17739,7 +17752,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons

 GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
    const int min_batch_size = 32;
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
+    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
    GGML_UNUSED(backend);
 }

--- a/ggml.c
+++ b/ggml.c
@@ -4,6 +4,7 @@
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
+#include "sgemm.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -32,6 +33,10 @@
 #include <unistd.h>
 #endif

+#ifdef __ARM_FEATURE_MATMUL_INT8
+#undef GGML_USE_LLAMAFILE
+#endif
+
 #if defined(_MSC_VER)
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
@@ -853,18 +858,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 // simd mappings
 //

-#if defined(__ARM_NEON)
-#if !defined(__aarch64__)
-
-// 64-bit compatibility
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-#endif
-#endif
-
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros
@@ -4573,21 +4566,32 @@ void ggml_mul_mat_set_prec(

 // ggml_mul_mat_id

-// NOTE: id will be removed in the future and instead all the experts listed in ids will be computed
-//       this will allow computing all the used experts in a single matrix multiplication
+/*
+    c = ggml_mul_mat_id(ctx, as, b, ids);
+
+    as  -> [cols, rows, n_expert]
+    ids -> [n_experts_used, n_tokens] (i32)
+    b   -> [cols, n_expert_used, n_tokens]
+    c   -> [cols, n_expert_used, n_tokens]
+
+    in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+
+    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
+*/
 struct ggml_tensor * ggml_mul_mat_id(
        struct ggml_context * ctx,
        struct ggml_tensor  * as,
-        struct ggml_tensor  * ids,
-        int                   id,
-        struct ggml_tensor  * b) {
-
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * ids) {
+    GGML_ASSERT(!ggml_is_transposed(as));
    GGML_ASSERT(ids->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
+    GGML_ASSERT(b->ne[3] == 1); // b is 3d
    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
-    GGML_ASSERT(ids->ne[1] == b->ne[1]); // must have an expert per b row
-    GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
-    GGML_ASSERT(id >= 0 && id < ids->ne[0]); // valid id
+    GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
    GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
+    GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast

    bool is_node = false;

@@ -4595,11 +4599,9 @@ struct ggml_tensor * ggml_mul_mat_id(
        is_node = true;
    }

-    const int64_t ne[4] = { as->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

-    ggml_set_op_params_i32(result, 0, id);
-
    result->op   = GGML_OP_MUL_MAT_ID;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = as;
@@ -10810,6 +10812,28 @@ static void ggml_compute_forward_mul_mat(
    }
 #endif

+#if GGML_USE_LLAMAFILE
+    if (src1_cont) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)src1->data + i12*nb12 + i13*nb13,
+                                     nb11/ggml_type_size(src1->type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     src1->type,
+                                     dst->type))
+                    goto UseGgmlGemm1;
+        return;
+    }
+UseGgmlGemm1:;
+#endif
+
    if (params->type == GGML_TASK_TYPE_INIT) {
        if (ith != 0) {
            return;
@@ -10841,6 +10865,28 @@ static void ggml_compute_forward_mul_mat(
    const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
    const size_t row_size = ggml_row_size(vec_dot_type, ne10);

+#if GGML_USE_LLAMAFILE
+    if (src1->type != vec_dot_type) {
+        for (int64_t i13 = 0; i13 < ne13; i13++)
+            for (int64_t i12 = 0; i12 < ne12; i12++)
+                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                     (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
+                                     nb01/ggml_type_size(src0->type),
+                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
+                                     row_size/ggml_type_size(vec_dot_type),
+                                     (char *)dst->data + i12*nb2 + i13*nb3,
+                                     nb1/ggml_type_size(dst->type),
+                                     ith, nth,
+                                     params->type,
+                                     src0->type,
+                                     vec_dot_type,
+                                     dst->type))
+                    goto UseGgmlGemm2;
+        return;
+    }
+UseGgmlGemm2:;
+#endif
+
    const int64_t nr0 = ne01;          // src0 rows
    const int64_t nr1 = ne1*ne12*ne13; // src1 rows

@@ -10958,11 +11004,6 @@ static void ggml_compute_forward_mul_mat_id(
    enum ggml_type    const vec_dot_type          = type_traits[type].vec_dot_type;
    ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;

-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
    // we don't support permuted src0 or src1
    GGML_ASSERT(nb00 == ggml_type_size(type));
    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
@@ -10973,22 +11014,21 @@ static void ggml_compute_forward_mul_mat_id(
    GGML_ASSERT(nb1 <= nb2);
    GGML_ASSERT(nb2 <= nb3);

-    // broadcast is not supported with mmid
-    assert(ne12 == 1);
-    assert(ne13 == 1);
-
    // row groups
-    const int id   = ggml_get_op_params_i32(dst, 0);
-    const int n_as = src0->ne[2];
+    const int n_ids = ids->ne[0]; // n_expert_used
+    const int n_as  = ne02;       // n_expert

    char * wdata_src1_end = (src1->type == vec_dot_type) ?
            (char *) params->wdata :
            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));

-    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
-    int64_t * matrix_rows       = matrix_row_counts + n_as;     // [n_as][ne11]
+    struct mmid_row_mapping {
+        int32_t i1;
+        int32_t i2;
+    };

-    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
+    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
+    struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]

   if (params->type == GGML_TASK_TYPE_INIT) {
        if (ith != 0) {
@@ -11012,16 +11052,20 @@ static void ggml_compute_forward_mul_mat_id(
        }

        // initialize matrix_row_counts
-        GGML_ASSERT(wdata == wdata_src1_end);
        memset(matrix_row_counts, 0, n_as*sizeof(int64_t));

-        // group rows by src0 matrix
-        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]

-            GGML_ASSERT(row_id >= 0 && row_id < n_as);
-            MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01;
-            matrix_row_counts[row_id] += 1;
+        // group rows by src0 matrix
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
+            for (int id = 0; id < n_ids; ++id) {
+                const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]);
+
+                assert(i02 >= 0 && i02 < n_as);
+
+                MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1};
+                matrix_row_counts[i02] += 1;
+            }
        }

        return;
@@ -11039,15 +11083,13 @@ static void ggml_compute_forward_mul_mat_id(
            continue;
        }

-        size_t src0_offset = cur_a*src0->nb[2];
+        const char * src0_cur = (const char *) src0->data + cur_a*nb02;

        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
        const size_t row_size = ggml_row_size(vec_dot_type, ne10);

-        const int64_t nr0 = ne01;           // src0 rows
-        const int64_t nr1 = cne1*ne12*ne13; // src1 rows
-
-        //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+        const int64_t nr0 = ne01; // src0 rows
+        const int64_t nr1 = cne1; // src1 rows

        // distribute the thread work across the inner or outer loop based on which one is larger

@@ -11066,13 +11108,11 @@ static void ggml_compute_forward_mul_mat_id(
        const int64_t ir110 = dr1*ith1;
        const int64_t ir111 = MIN(ir110 + dr1, nr1);

-        //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
-
        // threads with no work simply yield (not sure if it helps)
-        if (ir010 >= ir011 || ir110 >= ir111) {
-            sched_yield();
-            continue;
-        }
+        //if (ir010 >= ir011 || ir110 >= ir111) {
+        //    sched_yield();
+        //    continue;
+        //}

        // block-tiling attempt
        const int64_t blck_0 = 16;
@@ -11084,20 +11124,16 @@ static void ggml_compute_forward_mul_mat_id(
        for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
            for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
                for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                    const int64_t  i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix
-                    const int64_t  i12 = (ir1 - i13*ne12*cne1)/cne1;
-                    const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1);
-                    const int64_t  i11 = MMID_MATRIX_ROW(cur_a, _i11);
+                    const int64_t _i12 = ir1; // logical row index for this expert

-                    // broadcast src0 into src1
-                    //const int64_t i03 = i13/r3;
-                    //const int64_t i02 = i12/r2;
+                    struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+                    const int id       = row_mapping.i1; // selected expert index

-                    const int64_t i1 = i11;
-                    const int64_t i2 = i12;
-                    const int64_t i3 = i13;
+                    const int64_t  i11 = id % ne11;
+                    const int64_t  i12 = row_mapping.i2; // row index in src1

-                    const char * src0_row = (const char *) src0->data + src0_offset;
+                    const int64_t  i1 = id;  // selected expert index
+                    const int64_t  i2 = i12; // row

                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
@@ -11105,25 +11141,26 @@ static void ggml_compute_forward_mul_mat_id(
                    // TODO: this is a bit of a hack, we should probably have a better way to handle this
                    const char * src1_col = (const char *) wdata +
                        (src1_cont || src1->type != vec_dot_type
-                        ? (i11      + i12*ne11 + i13*ne12*ne11)*row_size
-                        : (i11*nb11 + i12*nb12 + i13*nb13));
+                        ? (i11      + i12*ne11)*row_size
+                        : (i11*nb11 + i12*nb12));

-                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));

                    //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
                    //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
                    //}

                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1);
+                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
                    }
+
                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
                }
            }
        }
    }

-    #undef MMID_MATRIX_ROW
+#undef MMID_MATRIX_ROW
 }

 // ggml_compute_forward_out_prod
@@ -18462,7 +18499,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                    const int n_as = src0->ne[2];
                    cur += GGML_PAD(cur, sizeof(int64_t));       // align
                    cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                    cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows
+                    cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
                } break;
            case GGML_OP_OUT_PROD:
                {
@@ -20550,8 +20587,34 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
    return ok;
 }

+static void gguf_free_kv(struct gguf_kv * kv) {
+    if (kv->key.data) {
+        GGML_FREE(kv->key.data);
+    }
+
+    if (kv->type == GGUF_TYPE_STRING) {
+        if (kv->value.str.data) {
+            GGML_FREE(kv->value.str.data);
+        }
+    }
+
+    if (kv->type == GGUF_TYPE_ARRAY) {
+        if (kv->value.arr.data) {
+            if (kv->value.arr.type == GGUF_TYPE_STRING) {
+                for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
+                    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
+                    if (str->data) {
+                        GGML_FREE(str->data);
+                    }
+                }
+            }
+            GGML_FREE(kv->value.arr.data);
+        }
+    }
+}
+
 struct gguf_context * gguf_init_empty(void) {
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));

    memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
    ctx->header.version   = GGUF_VERSION;
@@ -20596,7 +20659,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

    bool ok = true;

-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));

    // read the header
    {
@@ -20633,9 +20696,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

    // read the kv pairs
    {
-        ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
+        const uint64_t n_kv = ctx->header.n_kv;

-        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
+        // header.n_kv will hold the actual value of pairs that were successfully read in the loop below
+        ctx->header.n_kv = 0;
+        ctx->kv = GGML_CALLOC(n_kv, sizeof(struct gguf_kv));
+
+        for (uint64_t i = 0; i < n_kv; ++i) {
            struct gguf_kv * kv = &ctx->kv[i];

            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -20684,7 +20751,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                        return NULL;
                                    }

-                                    kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
+                                    kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type));

                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
                                } break;
@@ -20698,7 +20765,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                        return NULL;
                                    }

-                                    kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
+                                    kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str));

                                    for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                                        ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
@@ -20714,6 +20781,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            if (!ok) {
                break;
            }
+
+            ctx->header.n_kv++;
        }

        if (!ok) {
@@ -20726,7 +20795,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

    // read the tensor infos
    {
-        ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+        ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));

        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
            struct gguf_tensor_info * info = &ctx->infos[i];
@@ -20747,6 +20816,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),    &offset);
            ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset),  &offset);

+            // TODO: return an error instead of crashing with GGML_ASSERT
            gguf_tensor_info_sanitize(info);

            if (!ok) {
@@ -20862,12 +20932,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

            ok = ok && cur != NULL;

-            ggml_set_name(cur, ctx->infos[i].name.data);
-
            if (!ok) {
                break;
            }

+            ggml_set_name(cur, ctx->infos[i].name.data);
+
            // point the data member to the appropriate location in the binary blob using the tensor infos
            if (!params.no_alloc) {
              //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
@@ -20899,31 +20969,7 @@ void gguf_free(struct gguf_context * ctx) {
    if (ctx->kv) {
        // free string memory - not great..
        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
-            struct gguf_kv * kv = &ctx->kv[i];
-
-            if (kv->key.data) {
-                GGML_FREE(kv->key.data);
-            }
-
-            if (kv->type == GGUF_TYPE_STRING) {
-                if (kv->value.str.data) {
-                    GGML_FREE(kv->value.str.data);
-                }
-            }
-
-            if (kv->type == GGUF_TYPE_ARRAY) {
-                if (kv->value.arr.data) {
-                    if (kv->value.arr.type == GGUF_TYPE_STRING) {
-                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                            struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
-                            if (str->data) {
-                                GGML_FREE(str->data);
-                            }
-                        }
-                    }
-                    GGML_FREE(kv->value.arr.data);
-                }
-            }
+            gguf_free_kv(&ctx->kv[i]);
        }

        GGML_FREE(ctx->kv);
@@ -20941,7 +20987,7 @@ void gguf_free(struct gguf_context * ctx) {
        GGML_FREE(ctx->infos);
    }

-    GGML_ALIGNED_FREE(ctx);
+    GGML_FREE(ctx);
 }

 const char * gguf_type_name(enum gguf_type type) {
@@ -21148,6 +21194,19 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
    return n_kv;
 }

+void gguf_remove_key(struct gguf_context * ctx, const char * key) {
+    const int idx = gguf_find_key(ctx, key);
+    if (idx >= 0) {
+        const int n_kv = gguf_get_n_kv(ctx);
+        gguf_free_kv(&ctx->kv[idx]);
+        for (int i = idx; i < n_kv-1; ++i) {
+            ctx->kv[i] = ctx->kv[i+1];
+        }
+        ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
+        ctx->header.n_kv--;
+    }
+}
+
 void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
    const int idx = gguf_get_or_add_key(ctx, key);

@@ -21239,7 +21298,7 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
    ctx->kv[idx].value.arr.type = type;
    ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
+    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
    memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
 }

@@ -21249,7 +21308,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
    ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
    ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
+    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
    for (int i = 0; i < n; i++) {
        struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
        str->n    = strlen(data[i]);
@@ -21276,7 +21335,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
            case GGUF_TYPE_ARRAY:
                {
                    if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
+                        const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
                        for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
                            data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                        }
@@ -21364,7 +21423,7 @@ struct gguf_buf {

 static struct gguf_buf gguf_buf_init(size_t size) {
    struct gguf_buf buf = {
-        /*buf.data   =*/ size == 0 ? NULL : GGML_MALLOC(size),
+        /*buf.data   =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
        /*buf.size   =*/ size,
        /*buf.offset =*/ 0,
    };
--- a/ggml.h
+++ b/ggml.h
@@ -762,6 +762,8 @@ extern "C" {
    // use this to compute the memory overhead of a tensor
    GGML_API size_t ggml_tensor_overhead(void);

+    GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
+
    // main

    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
@@ -1161,13 +1163,11 @@ extern "C" {
            enum ggml_prec       prec);

    // indirect matrix multiplication
-    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
    GGML_API struct ggml_tensor * ggml_mul_mat_id(
            struct ggml_context * ctx,
            struct ggml_tensor  * as,
-            struct ggml_tensor  * ids,
-            int                   id,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * ids);

    // A: m columns, n rows,
    // B: p columns, n rows,
@@ -2289,6 +2289,9 @@ extern "C" {
    GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
    GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);

+    // removes key if it exists
+    GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
+
    // overrides existing values or adds a new one
    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
--- a/gguf-py/README.md
+++ b/gguf-py/README.md
@@ -21,6 +21,8 @@ pip install gguf

 [scripts/gguf-convert-endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-convert-endian.py) — Allows converting the endianness of GGUF files.

+[scripts/gguf-new-metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-new-metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
+
 ## Development
 Maintainers who participate in development of this package are advised to install it in editable mode:

--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -90,6 +90,13 @@ class Keys:
        HF_JSON          = "tokenizer.huggingface.json"
        RWKV             = "tokenizer.rwkv.world"
        CHAT_TEMPLATE    = "tokenizer.chat_template"
+        CHAT_TEMPLATE_N  = "tokenizer.chat_template.{name}"
+        CHAT_TEMPLATES   = "tokenizer.chat_templates"
+        # FIM/Infill special tokens constants
+        PREFIX_ID        = "tokenizer.ggml.prefix_token_id"
+        SUFFIX_ID        = "tokenizer.ggml.suffix_token_id"
+        MIDDLE_ID        = "tokenizer.ggml.middle_token_id"
+        EOT_ID           = "tokenizer.ggml.eot_token_id"


 #
@@ -115,7 +122,9 @@ class MODEL_ARCH(IntEnum):
    STABLELM   = auto()
    QWEN       = auto()
    QWEN2      = auto()
+    QWEN2MOE   = auto()
    PHI2       = auto()
+    PHI3       = auto()
    PLAMO      = auto()
    CODESHELL  = auto()
    ORION      = auto()
@@ -126,44 +135,50 @@ class MODEL_ARCH(IntEnum):
    MAMBA      = auto()
    XVERSE     = auto()
    COMMAND_R  = auto()
+    DBRX       = auto()
+    OLMO       = auto()


 class MODEL_TENSOR(IntEnum):
-    TOKEN_EMBD      = auto()
-    TOKEN_EMBD_NORM = auto()
-    TOKEN_TYPES     = auto()
-    POS_EMBD        = auto()
-    OUTPUT          = auto()
-    OUTPUT_NORM     = auto()
-    ROPE_FREQS      = auto()
-    ATTN_Q          = auto()
-    ATTN_K          = auto()
-    ATTN_V          = auto()
-    ATTN_QKV        = auto()
-    ATTN_OUT        = auto()
-    ATTN_NORM       = auto()
-    ATTN_NORM_2     = auto()
-    ATTN_OUT_NORM   = auto()
-    ATTN_ROT_EMBD   = auto()
-    FFN_GATE_INP    = auto()
-    FFN_NORM        = auto()
-    FFN_GATE        = auto()
-    FFN_DOWN        = auto()
-    FFN_UP          = auto()
-    FFN_ACT         = auto()
-    FFN_GATE_EXP    = auto()
-    FFN_DOWN_EXP    = auto()
-    FFN_UP_EXP      = auto()
-    ATTN_Q_NORM     = auto()
-    ATTN_K_NORM     = auto()
-    LAYER_OUT_NORM  = auto()
-    SSM_IN          = auto()
-    SSM_CONV1D      = auto()
-    SSM_X           = auto()
-    SSM_DT          = auto()
-    SSM_A           = auto()
-    SSM_D           = auto()
-    SSM_OUT         = auto()
+    TOKEN_EMBD         = auto()
+    TOKEN_EMBD_NORM    = auto()
+    TOKEN_TYPES        = auto()
+    POS_EMBD           = auto()
+    OUTPUT             = auto()
+    OUTPUT_NORM        = auto()
+    ROPE_FREQS         = auto()
+    ATTN_Q             = auto()
+    ATTN_K             = auto()
+    ATTN_V             = auto()
+    ATTN_QKV           = auto()
+    ATTN_OUT           = auto()
+    ATTN_NORM          = auto()
+    ATTN_NORM_2        = auto()
+    ATTN_OUT_NORM      = auto()
+    ATTN_ROT_EMBD      = auto()
+    FFN_GATE_INP       = auto()
+    FFN_GATE_INP_SHEXP = auto()
+    FFN_NORM           = auto()
+    FFN_GATE           = auto()
+    FFN_DOWN           = auto()
+    FFN_UP             = auto()
+    FFN_ACT            = auto()
+    FFN_GATE_EXP       = auto()
+    FFN_DOWN_EXP       = auto()
+    FFN_UP_EXP         = auto()
+    FFN_GATE_SHEXP     = auto()
+    FFN_DOWN_SHEXP     = auto()
+    FFN_UP_SHEXP       = auto()
+    ATTN_Q_NORM        = auto()
+    ATTN_K_NORM        = auto()
+    LAYER_OUT_NORM     = auto()
+    SSM_IN             = auto()
+    SSM_CONV1D         = auto()
+    SSM_X              = auto()
+    SSM_DT             = auto()
+    SSM_A              = auto()
+    SSM_D              = auto()
+    SSM_OUT            = auto()


 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -184,7 +199,9 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.STABLELM:       "stablelm",
    MODEL_ARCH.QWEN:           "qwen",
    MODEL_ARCH.QWEN2:          "qwen2",
+    MODEL_ARCH.QWEN2MOE:       "qwen2moe",
    MODEL_ARCH.PHI2:           "phi2",
+    MODEL_ARCH.PHI3:           "phi3",
    MODEL_ARCH.PLAMO:          "plamo",
    MODEL_ARCH.CODESHELL:      "codeshell",
    MODEL_ARCH.ORION:          "orion",
@@ -195,44 +212,50 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    MODEL_ARCH.MAMBA:          "mamba",
    MODEL_ARCH.XVERSE:         "xverse",
    MODEL_ARCH.COMMAND_R:      "command-r",
+    MODEL_ARCH.DBRX:           "dbrx",
+    MODEL_ARCH.OLMO:           "olmo",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
-    MODEL_TENSOR.TOKEN_EMBD:      "token_embd",
-    MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
-    MODEL_TENSOR.TOKEN_TYPES:     "token_types",
-    MODEL_TENSOR.POS_EMBD:        "position_embd",
-    MODEL_TENSOR.OUTPUT_NORM:     "output_norm",
-    MODEL_TENSOR.OUTPUT:          "output",
-    MODEL_TENSOR.ROPE_FREQS:      "rope_freqs",
-    MODEL_TENSOR.ATTN_NORM:       "blk.{bid}.attn_norm",
-    MODEL_TENSOR.ATTN_NORM_2:     "blk.{bid}.attn_norm_2",
-    MODEL_TENSOR.ATTN_QKV:        "blk.{bid}.attn_qkv",
-    MODEL_TENSOR.ATTN_Q:          "blk.{bid}.attn_q",
-    MODEL_TENSOR.ATTN_K:          "blk.{bid}.attn_k",
-    MODEL_TENSOR.ATTN_V:          "blk.{bid}.attn_v",
-    MODEL_TENSOR.ATTN_OUT:        "blk.{bid}.attn_output",
-    MODEL_TENSOR.ATTN_ROT_EMBD:   "blk.{bid}.attn_rot_embd",
-    MODEL_TENSOR.ATTN_Q_NORM:     "blk.{bid}.attn_q_norm",
-    MODEL_TENSOR.ATTN_K_NORM:     "blk.{bid}.attn_k_norm",
-    MODEL_TENSOR.ATTN_OUT_NORM:   "blk.{bid}.attn_output_norm",
-    MODEL_TENSOR.FFN_GATE_INP:    "blk.{bid}.ffn_gate_inp",
-    MODEL_TENSOR.FFN_NORM:        "blk.{bid}.ffn_norm",
-    MODEL_TENSOR.FFN_GATE:        "blk.{bid}.ffn_gate",
-    MODEL_TENSOR.FFN_DOWN:        "blk.{bid}.ffn_down",
-    MODEL_TENSOR.FFN_UP:          "blk.{bid}.ffn_up",
-    MODEL_TENSOR.FFN_ACT:         "blk.{bid}.ffn",
-    MODEL_TENSOR.FFN_GATE_EXP:    "blk.{bid}.ffn_gate_exps",
-    MODEL_TENSOR.FFN_DOWN_EXP:    "blk.{bid}.ffn_down_exps",
-    MODEL_TENSOR.FFN_UP_EXP:      "blk.{bid}.ffn_up_exps",
-    MODEL_TENSOR.LAYER_OUT_NORM:  "blk.{bid}.layer_output_norm",
-    MODEL_TENSOR.SSM_IN:          "blk.{bid}.ssm_in",
-    MODEL_TENSOR.SSM_CONV1D:      "blk.{bid}.ssm_conv1d",
-    MODEL_TENSOR.SSM_X:           "blk.{bid}.ssm_x",
-    MODEL_TENSOR.SSM_DT:          "blk.{bid}.ssm_dt",
-    MODEL_TENSOR.SSM_A:           "blk.{bid}.ssm_a",
-    MODEL_TENSOR.SSM_D:           "blk.{bid}.ssm_d",
-    MODEL_TENSOR.SSM_OUT:         "blk.{bid}.ssm_out",
+    MODEL_TENSOR.TOKEN_EMBD:         "token_embd",
+    MODEL_TENSOR.TOKEN_EMBD_NORM:    "token_embd_norm",
+    MODEL_TENSOR.TOKEN_TYPES:        "token_types",
+    MODEL_TENSOR.POS_EMBD:           "position_embd",
+    MODEL_TENSOR.OUTPUT_NORM:        "output_norm",
+    MODEL_TENSOR.OUTPUT:             "output",
+    MODEL_TENSOR.ROPE_FREQS:         "rope_freqs",
+    MODEL_TENSOR.ATTN_NORM:          "blk.{bid}.attn_norm",
+    MODEL_TENSOR.ATTN_NORM_2:        "blk.{bid}.attn_norm_2",
+    MODEL_TENSOR.ATTN_QKV:           "blk.{bid}.attn_qkv",
+    MODEL_TENSOR.ATTN_Q:             "blk.{bid}.attn_q",
+    MODEL_TENSOR.ATTN_K:             "blk.{bid}.attn_k",
+    MODEL_TENSOR.ATTN_V:             "blk.{bid}.attn_v",
+    MODEL_TENSOR.ATTN_OUT:           "blk.{bid}.attn_output",
+    MODEL_TENSOR.ATTN_ROT_EMBD:      "blk.{bid}.attn_rot_embd",
+    MODEL_TENSOR.ATTN_Q_NORM:        "blk.{bid}.attn_q_norm",
+    MODEL_TENSOR.ATTN_K_NORM:        "blk.{bid}.attn_k_norm",
+    MODEL_TENSOR.ATTN_OUT_NORM:      "blk.{bid}.attn_output_norm",
+    MODEL_TENSOR.FFN_GATE_INP:       "blk.{bid}.ffn_gate_inp",
+    MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
+    MODEL_TENSOR.FFN_NORM:           "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_GATE:           "blk.{bid}.ffn_gate",
+    MODEL_TENSOR.FFN_DOWN:           "blk.{bid}.ffn_down",
+    MODEL_TENSOR.FFN_UP:             "blk.{bid}.ffn_up",
+    MODEL_TENSOR.FFN_GATE_SHEXP:     "blk.{bid}.ffn_gate_shexp",
+    MODEL_TENSOR.FFN_DOWN_SHEXP:     "blk.{bid}.ffn_down_shexp",
+    MODEL_TENSOR.FFN_UP_SHEXP:       "blk.{bid}.ffn_up_shexp",
+    MODEL_TENSOR.FFN_ACT:            "blk.{bid}.ffn",
+    MODEL_TENSOR.FFN_GATE_EXP:       "blk.{bid}.ffn_gate_exps",
+    MODEL_TENSOR.FFN_DOWN_EXP:       "blk.{bid}.ffn_down_exps",
+    MODEL_TENSOR.FFN_UP_EXP:         "blk.{bid}.ffn_up_exps",
+    MODEL_TENSOR.LAYER_OUT_NORM:     "blk.{bid}.layer_output_norm",
+    MODEL_TENSOR.SSM_IN:             "blk.{bid}.ssm_in",
+    MODEL_TENSOR.SSM_CONV1D:         "blk.{bid}.ssm_conv1d",
+    MODEL_TENSOR.SSM_X:              "blk.{bid}.ssm_x",
+    MODEL_TENSOR.SSM_DT:             "blk.{bid}.ssm_dt",
+    MODEL_TENSOR.SSM_A:              "blk.{bid}.ssm_a",
+    MODEL_TENSOR.SSM_D:              "blk.{bid}.ssm_d",
+    MODEL_TENSOR.SSM_OUT:            "blk.{bid}.ssm_out",
 }

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -438,6 +461,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_GATE,
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
    ],
    MODEL_ARCH.QWEN: [
        MODEL_TENSOR.TOKEN_EMBD,
@@ -467,6 +492,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
+    MODEL_ARCH.QWEN2MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_INP_SHEXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
    MODEL_ARCH.PLAMO: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
@@ -508,6 +552,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_DOWN,
        MODEL_TENSOR.FFN_UP,
    ],
+    MODEL_ARCH.PHI3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
    MODEL_ARCH.CODESHELL: [
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.POS_EMBD,
@@ -642,6 +700,30 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.ATTN_K_NORM,
        MODEL_TENSOR.ATTN_Q_NORM,
    ],
+    MODEL_ARCH.DBRX: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_OUT_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+    ],
+    MODEL_ARCH.OLMO: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
    # TODO
 }

@@ -806,6 +888,7 @@ GGML_QUANT_SIZES = {
    GGMLQuantizationType.I32:     (1, 4),
    GGMLQuantizationType.I64:     (1, 8),
    GGMLQuantizationType.F64:     (1, 8),
+    GGMLQuantizationType.IQ1_M:   (256, QK_K // 8 + QK_K // 16  + QK_K // 32),
 }


@@ -870,3 +953,7 @@ KEY_TOKENIZER_CLS_ID     = Keys.Tokenizer.CLS_ID
 KEY_TOKENIZER_MASK_ID    = Keys.Tokenizer.MASK_ID
 KEY_TOKENIZER_HF_JSON    = Keys.Tokenizer.HF_JSON
 KEY_TOKENIZER_RWKV       = Keys.Tokenizer.RWKV
+KEY_TOKENIZER_PRIFIX_ID  = Keys.Tokenizer.PREFIX_ID
+KEY_TOKENIZER_SUFFIX_ID  = Keys.Tokenizer.SUFFIX_ID
+KEY_TOKENIZER_MIDDLE_ID  = Keys.Tokenizer.MIDDLE_ID
+KEY_TOKENIZER_EOT_ID     = Keys.Tokenizer.EOT_ID
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -6,7 +6,8 @@ import struct
 import tempfile
 from enum import Enum, auto
 from io import BufferedWriter
-from typing import IO, Any, Sequence
+from typing import IO, Any, Sequence, Mapping
+from string import ascii_letters, digits

 import numpy as np

@@ -466,9 +467,47 @@ class GGUFWriter:
    def add_add_space_prefix(self, value: bool) -> None:
        self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)

-    def add_chat_template(self, value: str) -> None:
+    def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
+        if isinstance(value, list):
+            template_default = None
+            template_names = set()
+
+            for choice in value:
+                name = choice.get('name', '')
+                template = choice.get('template')
+
+                # Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it
+                name = ''.join((c if c in ascii_letters + digits else '_' for c in name))
+
+                if name and template is not None:
+                    if name == 'default':
+                        template_default = template
+                    else:
+                        template_names.add(name)
+                        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template)
+
+            if template_names:
+                self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(template_names))
+
+            if template_default is None:
+                return
+
+            value = template_default
+
        self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)

+    def add_prefix_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)
+
+    def add_suffix_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id)
+
+    def add_middle_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id)
+
+    def add_eot_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.EOT_ID, id)
+
    def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
        pack_prefix = ''
        if not skip_pack_prefix:
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -10,7 +10,7 @@ class TensorNameMap:
        # Token embeddings
        MODEL_TENSOR.TOKEN_EMBD: (
            "gpt_neox.embed_in",                         # gptneox
-            "transformer.wte",                           # gpt2 gpt-j mpt refact qwen
+            "transformer.wte",                           # gpt2 gpt-j mpt refact qwen dbrx
            "transformer.word_embeddings",               # falcon
            "word_embeddings",                           # bloom
            "model.embed_tokens",                        # llama-hf
@@ -48,7 +48,7 @@ class TensorNameMap:
        # Output
        MODEL_TENSOR.OUTPUT: (
            "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx
            "output",                    # llama-pth bloom internlm2
            "word_embeddings_for_head",  # persimmon
            "lm_head.linear",            # phi2
@@ -60,7 +60,7 @@ class TensorNameMap:
            "transformer.ln_f",                        # gpt2 gpt-j falcon
            "model.norm",                              # llama-hf baichuan internlm2
            "norm",                                    # llama-pth
-            "transformer.norm_f",                      # mpt
+            "transformer.norm_f",                      # mpt dbrx
            "ln_f",                                    # refact bloom qwen gpt2
            "language_model.encoder.final_layernorm",  # persimmon
            "model.final_layernorm",                   # persimmon
@@ -96,6 +96,7 @@ class TensorNameMap:
            "model.layers.{bid}.norm",                              # mamba-qbert
            "backbone.layers.{bid}.norm",                           # mamba
            "transformer.decoder_layer.{bid}.rms_norm",             # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.norm_1",       # dbrx
        ),

        # Attention norm 2
@@ -108,6 +109,7 @@ class TensorNameMap:
            "gpt_neox.layers.{bid}.attention.query_key_value",                     # gptneox
            "transformer.h.{bid}.attn.c_attn",                                     # gpt2 qwen
            "transformer.blocks.{bid}.attn.Wqkv",                                  # mpt
+            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv",                   # dbrx
            "transformer.h.{bid}.self_attention.query_key_value",                  # falcon
            "h.{bid}.self_attention.query_key_value",                              # bloom
            "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
@@ -115,6 +117,7 @@ class TensorNameMap:
            "h.{bid}.attn.c_attn",                                                 # gpt2
            "transformer.h.{bid}.mixer.Wqkv",                                      # phi2
            "encoder.layers.{bid}.attn.Wqkv",                                      # nomic-bert
+            "model.layers.{bid}.self_attn.qkv_proj"                                # phi3
        ),

        # Attention query
@@ -152,23 +155,24 @@ class TensorNameMap:

        # Attention output
        MODEL_TENSOR.ATTN_OUT: (
-            "gpt_neox.layers.{bid}.attention.dense",                     # gptneox
-            "transformer.h.{bid}.attn.c_proj",                           # gpt2 refact qwen
-            "transformer.blocks.{bid}.attn.out_proj",                    # mpt
-            "transformer.h.{bid}.self_attention.dense",                  # falcon
-            "h.{bid}.self_attention.dense",                              # bloom
-            "model.layers.{bid}.self_attn.o_proj",                       # llama-hf
-            "layers.{bid}.attention.wo",                                 # llama-pth
-            "encoder.layer.{bid}.attention.output.dense",                # bert
-            "transformer.h.{bid}.attn.out_proj",                         # gpt-j
-            "language_model.encoder.layers.{bid}.self_attention.dense",  # persimmon
-            "model.layers.{bid}.self_attn.dense",                        # persimmon
-            "h.{bid}.attn.c_proj",                                       # gpt2
-            "transformer.h.{bid}.mixer.out_proj",                        # phi2
-            "model.layers.layers.{bid}.self_attn.o_proj",                # plamo
-            "model.layers.{bid}.attention.wo",                           # internlm2
-            "encoder.layers.{bid}.attn.out_proj",                        # nomic-bert
-            "transformer.decoder_layer.{bid}.multi_head_attention.linear"# Grok
+            "gpt_neox.layers.{bid}.attention.dense",                        # gptneox
+            "transformer.h.{bid}.attn.c_proj",                              # gpt2 refact qwen
+            "transformer.blocks.{bid}.attn.out_proj",                       # mpt
+            "transformer.h.{bid}.self_attention.dense",                     # falcon
+            "h.{bid}.self_attention.dense",                                 # bloom
+            "model.layers.{bid}.self_attn.o_proj",                          # llama-hf
+            "layers.{bid}.attention.wo",                                    # llama-pth
+            "encoder.layer.{bid}.attention.output.dense",                   # bert
+            "transformer.h.{bid}.attn.out_proj",                            # gpt-j
+            "language_model.encoder.layers.{bid}.self_attention.dense",     # persimmon
+            "model.layers.{bid}.self_attn.dense",                           # persimmon
+            "h.{bid}.attn.c_proj",                                          # gpt2
+            "transformer.h.{bid}.mixer.out_proj",                           # phi2
+            "model.layers.layers.{bid}.self_attn.o_proj",                   # plamo
+            "model.layers.{bid}.attention.wo",                              # internlm2
+            "encoder.layers.{bid}.attn.out_proj",                           # nomic-bert
+            "transformer.decoder_layer.{bid}.multi_head_attention.linear",  # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",        # dbrx
        ),

        # Attention output norm
@@ -176,6 +180,7 @@ class TensorNameMap:
            "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
            "encoder.layers.{bid}.norm1",                      # nomic-bert
            "transformer.decoder_layer.{bid}.rms_norm_1",      # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
        ),

        # Rotary embeddings
@@ -202,9 +207,15 @@ class TensorNameMap:
        ),

        MODEL_TENSOR.FFN_GATE_INP: (
-            "layers.{bid}.feed_forward.gate",           # mixtral
-            "model.layers.{bid}.block_sparse_moe.gate", # mixtral
-            "transformer.decoder_layer.{bid}.router"    # Grok
+            "layers.{bid}.feed_forward.gate",             # mixtral
+            "model.layers.{bid}.block_sparse_moe.gate",   # mixtral
+            "model.layers.{bid}.mlp.gate",                # qwen2moe
+            "transformer.decoder_layer.{bid}.router",     # Grok
+            "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
+        ),
+
+        MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
+            "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
        ),

        # Feed-forward up
@@ -224,6 +235,7 @@ class TensorNameMap:
            "h.{bid}.mlp.c_fc",                                       # gpt2
            "transformer.h.{bid}.mlp.fc1",                            # phi2
            "model.layers.{bid}.mlp.fc1",                             # phi2
+            "model.layers.{bid}.mlp.gate_up_proj",                    # phi3
            "model.layers.layers.{bid}.mlp.up_proj",                  # plamo
            "model.layers.{bid}.feed_forward.w3",                     # internlm2
            "encoder.layers.{bid}.mlp.fc11",                          # nomic-bert
@@ -231,8 +243,14 @@ class TensorNameMap:
        ),

        MODEL_TENSOR.FFN_UP_EXP: (
-            "layers.{bid}.feed_forward.experts.w3",                 # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear_v",         # Grok (merged)
+            "layers.{bid}.feed_forward.experts.w3",          # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_v",  # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1",   # dbrx
+            "model.layers.{bid}.mlp.experts.up_proj",        # qwen2moe (merged)
+        ),
+
+        MODEL_TENSOR.FFN_UP_SHEXP: (
+            "model.layers.{bid}.mlp.shared_expert.up_proj",  # qwen2moe
        ),

        # AWQ-activation gate
@@ -251,8 +269,14 @@ class TensorNameMap:
        ),

        MODEL_TENSOR.FFN_GATE_EXP: (
-            "layers.{bid}.feed_forward.experts.w1",                 # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear"            # Grok (merged)
+            "layers.{bid}.feed_forward.experts.w1",         # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear",   # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
+            "model.layers.{bid}.mlp.experts.gate_proj",     # qwen2moe (merged)
+        ),
+
+        MODEL_TENSOR.FFN_GATE_SHEXP: (
+            "model.layers.{bid}.mlp.shared_expert.gate_proj",  # qwen2moe
        ),

        # Feed-forward down
@@ -278,8 +302,14 @@ class TensorNameMap:
        ),

        MODEL_TENSOR.FFN_DOWN_EXP: (
-            "layers.{bid}.feed_forward.experts.w2",                 # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear_1",         # Grok (merged)
+            "layers.{bid}.feed_forward.experts.w2",          # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2",   # dbrx
+            "model.layers.{bid}.mlp.experts.down_proj",      # qwen2moe (merged)
+        ),
+
+        MODEL_TENSOR.FFN_DOWN_SHEXP: (
+            "model.layers.{bid}.mlp.shared_expert.down_proj",  # qwen2moe
        ),

        MODEL_TENSOR.ATTN_Q_NORM: (
@@ -358,7 +388,7 @@ class TensorNameMap:
                if tensor not in MODEL_TENSORS[arch]:
                    continue
                # TODO: make this configurable
-                n_experts = 8
+                n_experts = 60
                for xid in range(n_experts):
                    tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
                    self.mapping[tensor_name] = (tensor, tensor_name)
--- a/Show More
+++ b/Show More