[SYCL] fix no file in win rel (#6314 )

wpm : portable unicode tolower (#6305 )
Also use C locale for ispunct/isspace, and split unicode-data.cpp from unicode.cpp.
2026-02-12 14:03:20 +02:00 · 2024-03-27 09:47:06 +08:00 · 2024-03-26 17:46:21 -04:00 · 2024-03-26 16:46:41 +02:00 · 2024-03-26 15:21:27 +01:00 · 2024-03-26 14:32:19 +02:00
190 changed files with 29930 additions and 18811 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -12,6 +12,7 @@ Checks: >
    -readability-implicit-bool-conversion,
    -readability-magic-numbers,
    -readability-uppercase-literal-suffix,
+    -readability-simplify-boolean-expr,
    clang-analyzer-*,
    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
    performance-*,
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -26,8 +26,8 @@ COPY . .

 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1

 RUN make

--- a/.devops/llama-cpp-cublas.srpm.spec
+++ b/.devops/llama-cpp-cublas.srpm.spec
@@ -12,7 +12,7 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 #    It is up to the user to install the correct vendor-specific support.

-Name:           llama.cpp-cublas
+Name:           llama.cpp-cuda
 Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
@@ -32,16 +32,16 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master

 %build
-make -j LLAMA_CUBLAS=1
+make -j LLAMA_CUDA=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcublas
-cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
+cp -p main %{buildroot}%{_bindir}/llamacppcuda
+cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple

 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacublas.service
+%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacuda.service
 [Unit]
 Description=Llama.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
+ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -67,10 +67,10 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llamacppcublas
-%{_bindir}/llamacppcublasserver
-%{_bindir}/llamacppcublassimple
-/usr/lib/systemd/system/llamacublas.service
+%{_bindir}/llamacppcuda
+%{_bindir}/llamacppcudaserver
+%{_bindir}/llamacppcudasimple
+/usr/lib/systemd/system/llamacuda.service
 %config /etc/sysconfig/llama

 %pre
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -20,8 +20,8 @@ COPY . .

 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1

 RUN make

--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -4,13 +4,14 @@
  config,
  stdenv,
  mkShell,
+  runCommand,
  cmake,
  ninja,
  pkg-config,
  git,
  python3,
  mpi,
-  openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
+  blas,
  cudaPackages,
  darwin,
  rocmPackages,
@@ -35,7 +36,8 @@
  # It's necessary to consistently use backendStdenv when building with CUDA support,
  # otherwise we get libstdc++ errors downstream.
  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
-  enableStatic ? effectiveStdenv.hostPlatform.isStatic
+  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
+  precompileMetalShaders ? false
 }@inputs:

 let
@@ -87,6 +89,11 @@ let
    ]
  );

+  xcrunHost = runCommand "xcrunHost" {} ''
+    mkdir -p $out/bin
+    ln -s /usr/bin/xcrun $out/bin
+  '';
+
  # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
  # separately
  darwinBuildInputs =
@@ -150,6 +157,8 @@ effectiveStdenv.mkDerivation (
    postPatch = ''
      substituteInPlace ./ggml-metal.m \
        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+      substituteInPlace ./ggml-metal.m \
+        --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"

      # TODO: Package up each Python script or service appropriately.
      # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
@@ -157,6 +166,14 @@ effectiveStdenv.mkDerivation (
      substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
    '';

+    # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+    # `default.metallib` may be compiled with the Metal compiler from Xcode
+    # and we need to escape the sandbox on macOS to access the Metal compiler.
+    # `xcrun` is used to find the path of the Metal compiler, which is variable
+    # and not on $PATH
+    # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+    __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
+
    nativeBuildInputs =
      [
        cmake
@@ -173,6 +190,8 @@ effectiveStdenv.mkDerivation (
      ]
      ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
        glibc.static
+      ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [
+        xcrunHost
      ];

    buildInputs =
@@ -181,6 +200,7 @@ effectiveStdenv.mkDerivation (
      ++ optionals useMpi [ mpi ]
      ++ optionals useOpenCL [ clblast ]
      ++ optionals useRocm rocmBuildInputs
+      ++ optionals useBlas [ blas ]
      ++ optionals useVulkan vulkanBuildInputs;

    cmakeFlags =
@@ -191,7 +211,7 @@ effectiveStdenv.mkDerivation (
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
        (cmakeBool "LLAMA_BLAS" useBlas)
        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
-        (cmakeBool "LLAMA_CUBLAS" useCuda)
+        (cmakeBool "LLAMA_CUDA" useCuda)
        (cmakeBool "LLAMA_HIPBLAS" useRocm)
        (cmakeBool "LLAMA_METAL" useMetalKit)
        (cmakeBool "LLAMA_MPI" useMpi)
@@ -216,8 +236,10 @@ effectiveStdenv.mkDerivation (
        # Should likely use `rocmPackages.clr.gpuTargets`.
        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
      ]
-      ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
-      ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
+      ++ optionals useMetalKit [
+        (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
+        (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
+      ];

    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
    # if they haven't been added yet.
--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@@ -20,8 +20,8 @@ COPY . .

 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1

 RUN make

--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -15,14 +15,133 @@ on:
    types: [opened, synchronize, reopened]
    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']

+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
  GGML_NLOOP: 3
  GGML_N_THREADS: 1

 jobs:
+  macOS-latest-cmake-arm64:
+    runs-on: macos-14
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          mkdir build
+          cd build
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp LICENSE ./build/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip
+
+  macOS-latest-cmake-x64:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          mkdir build
+          cd build
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest -L main --verbose --timeout 900
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          cp LICENSE ./build/bin/
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
+
  ubuntu-focal-make:
    runs-on: ubuntu-20.04
+    env:
+      LLAMA_NODE_AVAILABLE: true
+      LLAMA_PYTHON_AVAILABLE: true

    steps:
      - name: Clone
@@ -35,6 +154,14 @@ jobs:
          sudo apt-get update
          sudo apt-get install build-essential gcc-8

+      - uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
      - name: Build
        id: make_build
        env:
@@ -48,6 +175,28 @@ jobs:
          CC=gcc-8 make tests -j $(nproc)
          make test -j $(nproc)

+  ubuntu-focal-make-curl:
+    runs-on: ubuntu-20.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev
+
+      - name: Build
+        id: make_build
+        env:
+          LLAMA_FATAL_WARNINGS: 1
+          LLAMA_CURL: 1
+        run: |
+          CC=gcc-8 make -j $(nproc)
+
  ubuntu-latest-cmake:
    runs-on: ubuntu-latest

@@ -76,40 +225,51 @@ jobs:
          cd build
          ctest -L main --verbose --timeout 900

-  ubuntu-latest-cmake-sanitizer:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Test
-        id: cmake_test
+      - name: Test llama2c conversion
+        id: llama2c_test
        run: |
          cd build
-          ctest -L main --verbose --timeout 900
+          echo "Fetch tokenizer"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin
+          echo "Fetch llama2c model"
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
+          ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
+          ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
+
+#  ubuntu-latest-cmake-sanitizer:
+#    runs-on: ubuntu-latest
+#
+#    continue-on-error: true
+#
+#    strategy:
+#      matrix:
+#        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+#        build_type: [Debug, Release]
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v3
+#
+#      - name: Dependencies
+#        id: depends
+#        run: |
+#          sudo apt-get update
+#          sudo apt-get install build-essential
+#
+#      - name: Build
+#        id: cmake_build
+#        run: |
+#          mkdir build
+#          cd build
+#          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+#          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+#
+#      - name: Test
+#        id: cmake_test
+#        run: |
+#          cd build
+#          ctest -L main --verbose --timeout 900

  ubuntu-latest-cmake-mpi:
    runs-on: ubuntu-latest
@@ -568,13 +728,13 @@ jobs:
          path: |
            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip

-  windows-latest-cmake-cublas:
+  windows-latest-cmake-cuda:
    runs-on: windows-latest

    strategy:
      matrix:
        cuda: ['12.2.0', '11.7.1']
-        build: ['cublas']
+        build: ['cuda']

    steps:
      - name: Clone
@@ -595,7 +755,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
+          cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

      - name: Determine tag name
@@ -640,6 +800,7 @@ jobs:

  windows-latest-cmake-sycl:
    runs-on: windows-latest
+
    defaults:
      run:
        shell: bash
@@ -648,7 +809,6 @@ jobs:
      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel

-
    steps:
      - name: Clone
        id: checkout
@@ -663,6 +823,32 @@ jobs:
        id: cmake_build
        run:  examples/sycl/win-build-sycl.bat

+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
+
  ios-xcode-build:
    runs-on: macos-latest

@@ -725,7 +911,9 @@ jobs:
      - macOS-latest-make
      - macOS-latest-cmake
      - windows-latest-cmake
-      - windows-latest-cmake-cublas
+      - windows-latest-cmake-cuda
+      - macOS-latest-cmake-arm64
+      - macOS-latest-cmake-x64

    steps:
      - name: Clone
--- a/.github/workflows/close-issue.yml
+++ b/.github/workflows/close-issue.yml
@@ -0,0 +1,23 @@
+name: Close inactive issues
+on:
+  schedule:
+    - cron: "42 0 * * *"
+
+jobs:
+  close-issues:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/stale@v5
+        with:
+          exempt-issue-labels: "refactor,help wanted,good first issue,research"
+          days-before-issue-stale: 30
+          days-before-issue-close: 14
+          stale-issue-label: "stale"
+          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
+          days-before-pr-stale: -1
+          days-before-pr-close: -1
+          operations-per-run: 10000
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/code-coverage.yml
+++ b/.github/workflows/code-coverage.yml
@@ -5,6 +5,10 @@ env:
  GGML_NLOOP: 3
  GGML_N_THREADS: 1

+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
  run:
    runs-on: ubuntu-20.04
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -15,6 +15,10 @@ on:
    branches:
      - master

+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -14,6 +14,10 @@ on:
    branches:
      - master

+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
  editorconfig:
    runs-on: ubuntu-latest
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -17,6 +17,10 @@ on:
    types: [opened, synchronize, reopened]
    paths: ['**/*.nix', 'flake.lock']

+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
  nix-build-aarch64:
    runs-on: ubuntu-latest
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -8,6 +8,10 @@ on:
  pull_request:
    types: [opened, synchronize, reopened]

+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
  nix-eval:
    strategy:
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -16,6 +16,10 @@ on:
      - 'requirements.txt'
      - 'requirements/*.txt'

+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
  python-check-requirements:
    runs-on: ubuntu-latest
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -2,6 +2,10 @@ name: flake8 Lint

 on: [push, pull_request]

+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
  flake8-lint:
    runs-on: ubuntu-latest
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -18,20 +18,23 @@ on:
  schedule:
    -  cron: '0 0 * * *'

+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
  server:
    runs-on: ubuntu-latest

    strategy:
      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        # TODO: temporarily disabled due to Linux kernel issues
+        #sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        sanitizer: [UNDEFINED]
        build_type: [Debug]
        include:
          - build_type: Release
            sanitizer: ""
-          - build_type: Debug
-            sanitizer: THREAD
-            disabled_on_pr: true
      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken

    container:
@@ -57,7 +60,8 @@ jobs:
            cmake \
            python3-pip \
            wget \
-            language-pack-en
+            language-pack-en \
+            libcurl4-openssl-dev

      - name: Build
        id: cmake_build
@@ -67,6 +71,7 @@ jobs:
          cmake .. \
              -DLLAMA_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
+              -DLLAMA_CURL=ON \
              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
@@ -101,12 +106,21 @@ jobs:
        with:
          fetch-depth: 0

+      - name: libCURL
+        id: get_libcurl
+        env:
+          CURL_VERSION: 8.6.0_6
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
+          mkdir $env:RUNNER_TEMP/libcurl
+          tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
+
      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
-          cmake ..  -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ;
+          cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server

      - name: Python setup
@@ -120,6 +134,11 @@ jobs:
        run: |
          pip install -r examples/server/tests/requirements.txt

+      - name: Copy Libcurl
+        id: prepare_libcurl
+        run: |
+          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
+
      - name: Tests
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
--- a/.github/workflows/zig-build.yml
+++ b/.github/workflows/zig-build.yml
@@ -6,6 +6,10 @@ on:
    branches:
      - master

+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
  build:
    strategy:
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,10 @@
 *.gcda
 *.dot
 *.bat
+*.tmp
 *.metallib
+*.etag
+*.lastModified
 .DS_Store
 .build/
 .cache/
@@ -47,6 +50,7 @@ models-mnt
 /embedding
 /gguf
 /gguf-llama-simple
+/gguf-split
 /gritlm
 /imatrix
 /infill
@@ -55,6 +59,9 @@ models-mnt
 /llava-cli
 /lookahead
 /lookup
+/lookup-create
+/lookup-merge
+/lookup-stats
 /main
 /metal
 /passkey
@@ -70,6 +77,7 @@ models-mnt
 /batched-bench
 /export-lora
 /finetune
+/retrieval
 /speculative
 /parallel
 /train-text-from-scratch
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,8 +89,8 @@ endif()
 option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
 option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS                          "llama: use CUDA"                                  OFF)
-#option(LLAMA_CUDA_CUBLAS                     "llama: use cuBLAS for prompt processing"          OFF)
+option(LLAMA_CUDA                            "llama: use CUDA"                                  OFF)
+option(LLAMA_CUBLAS                          "llama: use CUDA (deprecated, use LLAMA_CUDA)"     OFF)
 option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
 option(LLAMA_CUDA_FORCE_MMQ                  "llama: use mmq kernels instead of cuBLAS"         OFF)
 set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
@@ -99,6 +99,8 @@ option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                             "llama: max. batch size for using peer access")
+option(LLAMA_CUDA_NO_PEER_COPY               "llama: do not use peer to peer copies"            OFF)
+option(LLAMA_CURL                            "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
@@ -358,18 +360,25 @@ if (LLAMA_QKK_64)
 endif()

 if (LLAMA_CUBLAS)
+    message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
+    set(LLAMA_CUDA ON)
+endif()
+
+if (LLAMA_CUDA)
    cmake_minimum_required(VERSION 3.17)

    find_package(CUDAToolkit)
    if (CUDAToolkit_FOUND)
-        message(STATUS "cuBLAS found")
+        message(STATUS "CUDA found")

        enable_language(CUDA)

        set(GGML_HEADERS_CUDA ggml-cuda.h)
-        set(GGML_SOURCES_CUDA ggml-cuda.cu)

-        add_compile_definitions(GGML_USE_CUBLAS)
+        file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
+        list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
+
+        add_compile_definitions(GGML_USE_CUDA)
        if (LLAMA_CUDA_FORCE_DMMV)
            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
        endif()
@@ -386,6 +395,9 @@ if (LLAMA_CUBLAS)
        endif()
        add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
+        if (LLAMA_CUDA_NO_PEER_COPY)
+            add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+        endif()

        if (LLAMA_STATIC)
            if (WIN32)
@@ -415,7 +427,7 @@ if (LLAMA_CUBLAS)
    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

    else()
-        message(WARNING "cuBLAS not found")
+        message(WARNING "CUDA not found")
    endif()
 endif()

@@ -514,9 +526,11 @@ if (LLAMA_HIPBLAS)
    message(STATUS "HIP and hipBLAS found")

    set(GGML_HEADERS_ROCM ggml-cuda.h)
-    set(GGML_SOURCES_ROCM ggml-cuda.cu)

-    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+    file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
+    list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
+
+    add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)

    if (LLAMA_HIP_UMA)
        add_compile_definitions(GGML_HIP_UMA)
@@ -530,11 +544,15 @@ if (LLAMA_HIPBLAS)
        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
    endif()

+    if (LLAMA_CUDA_NO_PEER_COPY)
+        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+    endif()
+
    add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
    add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

-    set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
+    set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)

    if (LLAMA_STATIC)
        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
@@ -817,7 +835,7 @@ endif()

 set(CUDA_CXX_FLAGS "")

-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
    set(CUDA_FLAGS -use_fast_math)

    if (LLAMA_FATAL_WARNINGS)
@@ -1042,7 +1060,7 @@ endif()
 add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")

-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
    list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
    list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
@@ -1152,6 +1170,7 @@ add_library(llama
            llama.h
            unicode.h
            unicode.cpp
+            unicode-data.cpp
            )

 target_include_directories(llama PUBLIC .)
@@ -1247,6 +1266,12 @@ if (LLAMA_METAL)
            GROUP_READ
            WORLD_READ
        DESTINATION ${CMAKE_INSTALL_BINDIR})
+    if (NOT LLAMA_METAL_EMBED_LIBRARY)
+        install(
+            FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
+            DESTINATION ${CMAKE_INSTALL_BINDIR}
+        )
+    endif()
 endif()

 #
--- a/105
+++ b/105
@@ -1,15 +1,16 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search  \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+	simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search  \
+	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
-	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
+	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease                                 \
+	tests/test-json-schema-to-grammar

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -389,14 +390,20 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
+# LLAMA_CUBLAS is deprecated and will be removed in the future
+	LLAMA_CUDA := 1
+endif
+
+ifdef LLAMA_CUDA
 	ifneq ('', '$(wildcard /opt/cuda)')
 		CUDA_PATH ?= /opt/cuda
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
+	OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
@@ -451,19 +458,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-#ifdef LLAMA_CUDA_CUBLAS
-#	MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
-#endif # LLAMA_CUDA_CUBLAS
+ifdef LLAMA_CUDA_NO_PEER_COPY
+	MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
+
 ifdef JETSON_EOL_MODULE_DETECT
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+define NVCC_COMPILE
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
 else
+define NVCC_COMPILE
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
 endif # JETSON_EOL_MODULE_DETECT
-endif # LLAMA_CUBLAS
+
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+	$(NVCC_COMPILE)
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+	$(NVCC_COMPILE)
+
+endif # LLAMA_CUDA

 ifdef LLAMA_CLBLAST

@@ -509,7 +527,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
 endif # LLAMA_VULKAN

 ifdef LLAMA_HIPBLAS
-
 	ifeq ($(wildcard /opt/rocm),)
 		ROCM_PATH	?= /usr
 		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -521,7 +538,7 @@ ifdef LLAMA_HIPBLAS
 	LLAMA_CUDA_DMMV_X       ?= 32
 	LLAMA_CUDA_MMV_Y        ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
-	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
 ifdef LLAMA_HIP_UMA
 	MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
@@ -534,9 +551,17 @@ endif # LLAMA_HIP_UMA
 ifdef LLAMA_CUDA_FORCE_DMMV
 	HIPFLAGS 	+= -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_NO_PEER_COPY
+	HIPFLAGS 	+= -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
 	OBJS        += ggml-cuda.o
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+	OBJS        += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
+ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
 endif # LLAMA_HIPBLAS

 ifdef LLAMA_METAL
@@ -589,12 +614,17 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)

 # identify CUDA host compiler
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif

+ifdef LLAMA_CURL
+override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+override LDFLAGS  := $(LDFLAGS) -lcurl
+endif
+
 #
 # Print build information
 #
@@ -609,7 +639,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS:   $(LDFLAGS))
 $(info I CC:        $(shell $(CC)   --version | head -n 1))
 $(info I CXX:       $(shell $(CXX)  --version | head -n 1))
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 $(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@@ -619,9 +649,16 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
 $(info )

+ifdef LLAMA_CUBLAS
+$(info !!!!)
+$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+$(info !!!!)
+$(info )
+endif
+
 #
 # Build library
 #
@@ -641,7 +678,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 unicode.o: unicode.cpp unicode.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+unicode-data.o: unicode-data.cpp unicode-data.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o

 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -661,9 +701,15 @@ console.o: common/console.cpp common/console.h
 grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 train.o: common/train.cpp common/train.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -671,7 +717,8 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
 	ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)

 clean:
-	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf ggml-cuda/*.o
 	find examples pocs -type f -name "*.o" -delete

 #
@@ -740,7 +787,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

@@ -748,6 +795,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -785,6 +836,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -797,9 +852,15 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)

 passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -856,6 +917,10 @@ tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
--- a/Package.swift
+++ b/Package.swift
@@ -32,6 +32,7 @@ let package = Package(
                "ggml.c",
                "llama.cpp",
                "unicode.cpp",
+                "unicode-data.cpp",
                "ggml-alloc.c",
                "ggml-backend.c",
                "ggml-quants.c",
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -29,6 +29,8 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 ## News

 - 2024.3
+  - A blog post has been published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
+  - A new baseline is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
  - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
  - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
  - Support detecting all GPUs with level-zero and same top **Max compute units**.
@@ -81,7 +83,7 @@ For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, rec
 |-|-|-|
 |Ampere Series| Support| A100|

-### oneMKL
+### oneMKL for CUDA

 The current oneMKL release does not contain the oneMKL cuBlas backend.
 As a result for Nvidia GPU's oneMKL must be built from source.
@@ -114,7 +116,7 @@ You can choose between **F16** and **F32** build. F16 is faster for long-prompt
 # Or, for F32:
 docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .

-# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example
+# Note: you can also use the ".devops/server-intel.Dockerfile", which compiles the "server" example
 ```

 ### Run
@@ -254,16 +256,16 @@ Run without parameter:
 Check the ID in startup log, like:

 ```
-found 4 SYCL devices:
-  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
-    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
-  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
-    max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
-  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
-    max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
-  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
-    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
-
+found 6 SYCL devices:
+|  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
+|ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
+|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
+| 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
+| 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
+| 2|    [opencl:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       3.0|        512|    1024|     32|    16225243136|
+| 3|    [opencl:gpu:1]|                    Intel(R) UHD Graphics 770|       3.0|         32|     512|     32|    53651849216|
+| 4|    [opencl:cpu:0]|         13th Gen Intel(R) Core(TM) i7-13700K|       3.0|         24|    8192|     64|    67064815616|
+| 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|
 ```

 |Attribute|Note|
@@ -271,12 +273,35 @@ found 4 SYCL devices:
 |compute capability 1.3|Level-zero running time, recommended |
 |compute capability 3.0|OpenCL running time, slower than level-zero in most cases|

-4. Set device ID and execute llama.cpp
+4. Device selection and execution of llama.cpp

-Set device ID = 0 by **GGML_SYCL_DEVICE=0**
+There are two device selection modes:
+
+- Single device: Use one device selected by the user.
+- Multiple devices: Automatically select all devices that share the highest Max compute units value.
+
+|Device selection|Parameter|
+|-|-|
+|Single device|--split-mode none --main-gpu DEVICE_ID |
+|Multiple devices|--split-mode layer (default)|
+
+Examples:
+
+- Use device 0:

 ```sh
-GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
+```
+or run by script:
+
+```sh
+./examples/sycl/run_llama2.sh 0
+```
+
+- Use multiple devices:
+
+```sh
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
 ```
 or run by script:

@@ -289,12 +314,18 @@ Note:
 - By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.


-5. Check the device ID in output
+5. Verify the device ID in output
+
+Verify that the selected GPU is shown in the output, for example:

-Like:
 ```
-Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
+detect 1 SYCL GPUs: [0] with top Max compute units:512
 ```
+Or
+```
+use 1 SYCL GPUs: [0] with Max compute units:512
+```
+

 ## Windows

@@ -355,7 +386,7 @@ a. Download & install cmake for Windows: https://cmake.org/download/

 b. Download & install mingw-w64 make for Windows provided by w64devkit

- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
+- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).

 - Extract `w64devkit` on your pc.

@@ -430,15 +461,16 @@ build\bin\main.exe
 Check the ID in startup log, like:

 ```
-found 4 SYCL devices:
-  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
-    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
-  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
-    max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
-  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
-    max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
-  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
-    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+found 6 SYCL devices:
+|  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
+|ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
+|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
+| 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
+| 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
+| 2|    [opencl:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       3.0|        512|    1024|     32|    16225243136|
+| 3|    [opencl:gpu:1]|                    Intel(R) UHD Graphics 770|       3.0|         32|     512|     32|    53651849216|
+| 4|    [opencl:cpu:0]|         13th Gen Intel(R) Core(TM) i7-13700K|       3.0|         24|    8192|     64|    67064815616|
+| 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|

 ```

@@ -447,13 +479,31 @@ found 4 SYCL devices:
 |compute capability 1.3|Level-zero running time, recommended |
 |compute capability 3.0|OpenCL running time, slower than level-zero in most cases|

-4. Set device ID and execute llama.cpp

-Set device ID = 0 by **set GGML_SYCL_DEVICE=0**
+4. Device selection and execution of llama.cpp
+
+There are two device selection modes:
+
+- Single device: Use one device selected by the user.
+- Multiple devices: Automatically select all devices that share the highest Max compute units value.
+
+|Device selection|Parameter|
+|-|-|
+|Single device|--split-mode none --main-gpu DEVICE_ID |
+|Multiple devices|--split-mode layer (default)|
+
+Examples:
+
+- Use device 0:

 ```
-set GGML_SYCL_DEVICE=0
-build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
+build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
+```
+
+- Use multiple devices:
+
+```
+build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
 ```
 or run by script:

@@ -466,11 +516,17 @@ Note:
 - By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.


-5. Check the device ID in output

-Like:
+5. Verify the device ID in output
+
+Verify that the selected GPU is shown in the output, for example:
+
 ```
-Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
+detect 1 SYCL GPUs: [0] with top Max compute units:512
+```
+Or
+```
+use 1 SYCL GPUs: [0] with Max compute units:512
 ```

 ## Environment Variable
@@ -489,7 +545,6 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device

 |Name|Value|Function|
 |-|-|-|
-|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
 |ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|

@@ -507,6 +562,9 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device

 ## Q&A

+Note: please add the prefix **[SYCL]** to the issue title so that we can check it as soon as possible.
+
+
 - Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.

  Miss to enable oneAPI running environment.
@@ -538,4 +596,4 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device

 ## Todo

- Support multiple cards.
+- Support row layer split for multiple card runs.
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Recent API changes

+- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
 - [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
 - [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
@@ -17,10 +18,12 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

+- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
 - Multi-GPU pipeline parallelizm support https://github.com/ggerganov/llama.cpp/pull/6017
 - Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
 - Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
+- Support for loading sharded models, using the `gguf-split` CLI https://github.com/ggerganov/llama.cpp/pull/6187

 ----

@@ -112,6 +115,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
 - [x] [Mamba](https://github.com/state-spaces/mamba)
+- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)

 **Multimodal models:**

@@ -133,6 +137,7 @@ Typically finetunes of the base models below are supported as well.
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
+- TypeScript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
@@ -163,6 +168,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT)
 - [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
 - [pythops/tenere](https://github.com/pythops/tenere) (AGPL)
+- [RecurseChat](https://recurse.chat/) (proprietary)
 - [semperai/amica](https://github.com/semperai/amica)
 - [withcatai/catai](https://github.com/withcatai/catai)
 - [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
@@ -443,30 +449,27 @@ Building the program with BLAS support may lead to some performance improvements

  Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.

- #### cuBLAS
+- #### CUDA

-  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+  This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).

  For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.

  - Using `make`:
    ```bash
-    make LLAMA_CUBLAS=1
+    make LLAMA_CUDA=1
    ```
  - Using `CMake`:

    ```bash
    mkdir build
    cd build
-    cmake .. -DLLAMA_CUBLAS=ON
+    cmake .. -DLLAMA_CUDA=ON
    cmake --build . --config Release
    ```

  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:

-<!---
-  | LLAMA_CUDA_CUBLAS       | Boolean                |   false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
--->
  | Option                         | Legal values           | Default | Description |
  |--------------------------------|------------------------|---------|-------------|
  | LLAMA_CUDA_FORCE_DMMV          | Boolean                |   false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
@@ -628,6 +631,15 @@ Building the program with BLAS support may lead to some performance improvements

 - #### Vulkan

+> [!WARNING]
+>
+> Vulkan support has been broken in https://github.com/ggerganov/llama.cpp/pull/6122
+> due to relying on `GGML_OP_GET_ROWS` which is not yet properly supported by the Vulkan backend,
+> but should be fixed relatively soon (possibly in https://github.com/ggerganov/llama.cpp/pull/6155
+> (ref: https://github.com/ggerganov/llama.cpp/pull/6122#issuecomment-2015327635)).
+>
+> Meanwhile, if you want to use the Vulkan backend, you should use the commit right before the breaking change, https://github.com/ggerganov/llama.cpp/commit/55c1b2a3bbd470e9e2a3a0618b92cf64a885f806
+
  **With docker**:

  You don't need to install Vulkan SDK. It will be installed inside the container.
--- a/build.zig
+++ b/build.zig
@@ -116,24 +116,26 @@ pub fn build(b: *std.build.Builder) !void {
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
    const unicode = make.obj("unicode", "unicode.cpp");
+    const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
    const llama = make.obj("llama", "llama.cpp");
    const buildinfo = make.obj("common", "common/build-info.cpp");
    const common = make.obj("common", "common/common.cpp");
    const console = make.obj("console", "common/console.cpp");
    const sampling = make.obj("sampling", "common/sampling.cpp");
    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
+    const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
    const train = make.obj("train", "common/train.cpp");
    const clip = make.obj("clip", "examples/llava/clip.cpp");
    const llava = make.obj("llava", "examples/llava/llava.cpp");

-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, train });

-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, grammar_parser, clip, llava });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -40,7 +40,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
 fi

 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -412,8 +412,8 @@ function gg_run_open_llama_7b_v2 {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                                           ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert.py ${path_models}

--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -47,6 +47,8 @@ if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()

+set(TARGET json-schema-to-grammar)
+add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h)

 set(TARGET common)

@@ -60,14 +62,28 @@ add_library(${TARGET} STATIC
    console.cpp
    grammar-parser.h
    grammar-parser.cpp
+    json.hpp
    train.h
    train.cpp
+    ngram-cache.h
+    ngram-cache.cpp
    )

 if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()

+set(LLAMA_COMMON_EXTRA_LIBS build_info)
+
+# Use curl to download model url
+if (LLAMA_CURL)
+    find_package(CURL REQUIRED)
+    add_definitions(-DLLAMA_USE_CURL)
+    include_directories(${CURL_INCLUDE_DIRS})
+    find_library(CURL_LIBRARY curl REQUIRED)
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
+endif ()
+
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
--- a/common/common.cpp
+++ b/common/common.cpp
--- a/common/common.h
+++ b/common/common.h
@@ -37,10 +37,13 @@ extern char const *LLAMA_COMMIT;
 extern char const *LLAMA_COMPILER;
 extern char const *LLAMA_BUILD_TARGET;

+struct llama_control_vector_load_info;
+
+int32_t get_num_physical_cores();
+
 //
 // CLI argument parsing
 //
-int32_t get_num_physical_cores();

 struct gpt_params {
    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
@@ -85,17 +88,22 @@ struct gpt_params {
    // // sampling parameters
    struct llama_sampling_params sparams;

-    std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
-    std::string model_draft       = "";                              // draft model for speculative decoding
-    std::string model_alias       = "unknown"; // model alias
-    std::string prompt            = "";
-    std::string prompt_file       = "";  // store the external prompt file name
-    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
-    std::string input_prefix      = "";  // string to prefix user inputs with
-    std::string input_suffix      = "";  // string to suffix user inputs with
+    std::string model                = "models/7B/ggml-model-f16.gguf"; // model path
+    std::string model_draft          = "";  // draft model for speculative decoding
+    std::string model_alias          = "unknown"; // model alias
+    std::string model_url            = "";  // model url to download
+    std::string hf_repo              = "";  // HF repo
+    std::string hf_file              = "";  // HF file
+    std::string prompt               = "";
+    std::string prompt_file          = "";  // store the external prompt file name
+    std::string path_prompt_cache    = "";  // path to file for saving/loading prompt eval state
+    std::string input_prefix         = "";  // string to prefix user inputs with
+    std::string input_suffix         = "";  // string to suffix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
-    std::string logdir            = "";  // directory in which to save YAML log files
-    std::string logits_file       = "";  // file for saving *all* logits
+    std::string logdir               = "";  // directory in which to save YAML log files
+    std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
+    std::string logits_file          = "";  // file for saving *all* logits

    std::vector<llama_model_kv_override> kv_overrides;

@@ -103,6 +111,11 @@ struct gpt_params {
    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
    std::string lora_base  = "";                              // base model path for the lora adapter

+    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+
+    int32_t control_vector_layer_start = -1; // layer range for control vector
+    int32_t control_vector_layer_end   = -1; // layer range for control vector
+
    int  ppl_stride        = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
    int  ppl_output_type   = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                    //                                       (which is more convenient to use for plotting)
@@ -130,7 +143,7 @@ struct gpt_params {
    bool interactive_first = false; // wait for user input immediately
    bool multiline_input   = false; // reverse the usage of `\`
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
-    bool cont_batching     = false; // insert new sequences for decoding on-the-fly
+    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool ignore_eos        = false; // ignore generated EOS tokens
@@ -158,6 +171,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

 void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

+bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+
 std::string get_system_info(const gpt_params & params);

 std::string gpt_random_prompt(std::mt19937 & rng);
@@ -183,6 +198,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
+struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
+
 // Batch utils

 void llama_batch_clear(struct llama_batch & batch);
@@ -269,3 +287,31 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
 void llama_embd_normalize(const float * inp, float * out, int n);

 float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+
+//
+// Control vector utils
+//
+
+struct llama_control_vector_data { // result type of llama_control_vector_load(); {-1, empty} signals an error
+    int n_embd; // divisor used to derive the layer count from data (see below)
+
+    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
+    std::vector<float> data;
+};
+
+struct llama_control_vector_load_info { // one input entry for llama_control_vector_load()
+    float strength; // scale applied to this control vector before the vectors are added together
+
+    std::string fname; // presumably the path of the control vector file — confirm against the loader
+};
+
+// Load control vectors, scale each by strength, and add them together.
+// On error, returns {-1, empty}
+llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+
+//
+// Split utils
+//
+static const char * const LLM_KV_SPLIT_NO            = "split.no";
+static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
--- a/common/json-schema-to-grammar.cpp
+++ b/common/json-schema-to-grammar.cpp
@@ -0,0 +1,721 @@
+#include "json-schema-to-grammar.h"
+#include <algorithm>
+#include <fstream>
+#include <map>
+#include <regex>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+using json = nlohmann::ordered_json;
+
+const std::string SPACE_RULE = "\" \"?";
+
+std::unordered_map<std::string, std::string> PRIMITIVE_RULES = {
+    {"boolean", "(\"true\" | \"false\") space"},
+    {"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"},
+    {"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"},
+    {"value", "object | array | string | number | boolean"},
+    {"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"},
+    {"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"},
+    {"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
+                "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"},
+    {"string", " \"\\\"\" (\n"
+               "        [^\"\\\\] |\n"
+               "        \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n"
+               "      )* \"\\\"\" space"},
+    {"null", "\"null\" space"}
+};
+std::vector<std::string> OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"};
+
+std::unordered_map<std::string, std::string> DATE_RULES = {
+    {"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"},
+    {"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"},
+    {"date-time", "date \"T\" time"},
+    {"date-string", "\"\\\"\" date \"\\\"\" space"},
+    {"time-string", "\"\\\"\" time \"\\\"\" space"},
+    {"date-time-string", "\"\\\"\" date-time \"\\\"\" space"}
+};
+
+static bool is_reserved_name(const std::string & name) {
+    static std::unordered_set<std::string> reserved; // "root" + all primitive/date rule names, filled on first call
+    if (reserved.empty()) {
+        reserved.insert("root");
+        for (const auto & kv : PRIMITIVE_RULES) { reserved.insert(kv.first); }
+        for (const auto & kv : DATE_RULES)      { reserved.insert(kv.first); }
+    }
+    return reserved.count(name) != 0;
+}
+
+std::regex INVALID_RULE_CHARS_RE("[^a-zA-Z0-9-]+");
+std::regex GRAMMAR_LITERAL_ESCAPE_RE("[\r\n\"]");
+std::regex GRAMMAR_RANGE_LITERAL_ESCAPE_RE("[\r\n\"\\]\\-\\\\]");
+std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
+    {'\r', "\\r"}, {'\n', "\\n"}, {'"', "\\\""}, {'-', "\\-"}, {']', "\\]"}
+};
+
+std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
+std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
+
+// Streams the elements of [begin, end) into one string, writing `separator` between consecutive
+// elements. Only forward iteration is required, so list/set ranges work as well as vectors.
+template <typename Iterator>
+std::string join(Iterator begin, Iterator end, const std::string & separator) {
+    std::ostringstream result;
+    for (Iterator it = begin; it != end; ++it) {
+        if (it != begin) result << separator; // between elements only, never leading or trailing
+        result << *it;
+    }
+    return result.str();
+}
+
+// Splits `str` at every occurrence of `delimiter`; the result always holds at least one token.
+static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
+    std::vector<std::string> tokens;
+    if (delimiter.empty()) { // find("") never advances, so the loop below would not terminate
+        tokens.push_back(str);
+        return tokens;
+    }
+    size_t start = 0;
+    for (size_t end = str.find(delimiter); end != std::string::npos; end = str.find(delimiter, start)) {
+        tokens.push_back(str.substr(start, end - start));
+        start = end + delimiter.length();
+    }
+    tokens.push_back(str.substr(start)); // remainder after the last delimiter (may be empty)
+    return tokens;
+}
+
+// Returns `str` concatenated with itself `n` times (the empty string when n == 0).
+static std::string repeat(const std::string & str, size_t n) {
+    std::string out;
+    if (n == 0) {
+        return out;
+    }
+    // Reserve the full output length before appending.
+    out.reserve(str.size() * n);
+    size_t remaining = n;
+    while (remaining-- > 0) {
+        out.append(str);
+    }
+    return out;
+}
+
+// Rebuilds `input` with every match of `regex` replaced by `replacement(match)`;
+// the text between matches is copied through unchanged.
+static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch  &)> & replacement) {
+    std::string out;
+    std::smatch m;
+
+    auto cursor = input.cbegin();
+    const auto last = input.cend();
+
+    for (; std::regex_search(cursor, last, m, regex); cursor = m.suffix().first) {
+        out.append(cursor, m[0].first);  // unmatched text in front of this match
+        out.append(replacement(m));
+    }
+
+    out.append(cursor, last);  // tail after the final match
+    return out;
+}
+
+// Wraps `literal` in double quotes after escaping each character matched by GRAMMAR_LITERAL_ESCAPE_RE.
+static std::string format_literal(const std::string & literal) {
+    const auto escape_one = [&](const std::smatch & m) {
+        return GRAMMAR_LITERAL_ESCAPES.at(m.str()[0]);
+    };
+    return "\"" + replacePattern(literal, GRAMMAR_LITERAL_ESCAPE_RE, escape_one) + "\"";
+}
+
+
+class SchemaConverter {
+private:
+    std::function<json(const std::string &)> _fetch_json;
+    bool _dotall;
+    std::map<std::string, std::string> _rules;
+    std::unordered_map<std::string, json> _refs;
+    std::unordered_set<std::string> _refs_being_resolved;
+    std::vector<std::string> _errors;
+    std::vector<std::string> _warnings;
+
+    // Stores `rule` under a sanitized form of `name` and returns the key actually used. If that key
+    // already holds a different rule, a numeric suffix is appended until a usable key is found.
+    std::string _add_rule(const std::string & name, const std::string & rule) {
+        const std::string base = regex_replace(name, INVALID_RULE_CHARS_RE, "-");
+        std::string key = base;
+        auto slot = _rules.find(key);
+        // Probe "<base>0", "<base>1", ... while the candidate is taken by a different rule.
+        for (int n = 0; slot != _rules.end() && slot->second != rule; n++) {
+            key = base + std::to_string(n);
+            slot = _rules.find(key);
+        }
+
+        _rules[key] = rule;
+        return key;
+    }
+
+    // Visits each alternative as "<name>-<i>" ("alternative-<i>" when name is empty) and joins the results with " | ".
+    std::string _generate_union_rule(const std::string & name, const std::vector<json> & alt_schemas) {
+        const std::string prefix = name + (name.empty() ? "alternative-" : "-");
+        std::vector<std::string> alternatives;
+        for (size_t k = 0; k < alt_schemas.size(); k++) alternatives.push_back(visit(alt_schemas[k], prefix + std::to_string(k)));
+        return join(alternatives.begin(), alternatives.end(), " | ");
+    }
+
+    // Translates a JSON-schema "pattern" regex into grammar rules and returns the name of the rule added
+    // for it ("" when the pattern is not anchored as "^...$"). Problems are collected in _errors / _warnings.
+    // Handles '.', groups, character classes, '|' and the quantifiers * + ? {m,n}; remaining text is
+    // emitted as quoted literals.
+    std::string _visit_pattern(const std::string & pattern, const std::string & name) {
+        if (pattern.empty() || pattern.front() != '^' || pattern.back() != '$') {
+            _errors.push_back("Pattern must start with '^' and end with '$'");
+            return "";
+        }
+        std::string sub_pattern = pattern.substr(1, pattern.length() - 2);
+        std::unordered_map<std::string, std::string> sub_rule_ids;
+
+        size_t i = 0;
+        size_t length = sub_pattern.length();
+
+        using literal_or_rule = std::pair<std::string, bool>;
+        auto to_rule = [&](const literal_or_rule & ls) {
+            return ls.second ? "\"" + ls.first + "\"" : ls.first;  // quote literals, pass rules through
+        };
+        std::function<literal_or_rule()> transform = [&]() -> literal_or_rule {
+            size_t start = i;
+            std::vector<literal_or_rule> seq;
+
+            auto get_dot = [&]() {
+                std::string rule;
+                if (_dotall) {
+                    rule = "[\\U00000000-\\U0010FFFF]";
+                } else {
+                    rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]";
+                }
+                return _add_rule("dot", rule);
+            };
+
+            // Joins the sequence, merging consecutive literals together.
+            auto join_seq = [&]() {
+                std::vector<literal_or_rule> ret;
+
+                std::string literal;
+                auto flush_literal = [&]() {
+                    if (literal.empty()) {
+                        return false;
+                    }
+                    ret.push_back(std::make_pair(literal, true));
+                    literal.clear();
+                    return true;
+                };
+
+                for (const auto & item : seq) {
+                    auto is_literal = item.second;
+                    if (is_literal) {
+                        literal += item.first;
+                    } else {
+                        flush_literal();
+                        ret.push_back(item);
+                    }
+                }
+                flush_literal();
+
+                std::vector<std::string> results;
+                for (const auto & item : ret) {
+                    results.push_back(to_rule(item));
+                }
+                return std::make_pair(join(results.begin(), results.end(), " "), false);
+            };
+
+            while (i < length) {
+                char c = sub_pattern[i];
+                if (seq.empty() && (c == '*' || c == '+' || c == '?' || c == '{')) {
+                    // A quantifier needs something to repeat; seq.back() below would be undefined.
+                    _errors.push_back("Quantifier without a preceding element");
+                    return std::make_pair("", false);
+                }
+                if (c == '.') {
+                    seq.push_back(std::make_pair(get_dot(), false));
+                    i++;
+                } else if (c == '(') {
+                    i++;
+                    if (i < length && sub_pattern[i] == '?') {
+                        _warnings.push_back("Unsupported pattern syntax");
+                    }
+                    seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false));
+                } else if (c == ')') {
+                    i++;
+                    if (start > 0 && sub_pattern[start - 1] != '(') {
+                        _errors.push_back("Unbalanced parentheses");
+                    }
+                    return join_seq();
+                } else if (c == '[') {
+                    std::string square_brackets = std::string(1, c);
+                    i++;
+                    while (i < length && sub_pattern[i] != ']') {
+                        if (sub_pattern[i] == '\\') {
+                            square_brackets += sub_pattern.substr(i, 2);
+                            i += 2;
+                        } else {
+                            square_brackets += sub_pattern[i];
+                            i++;
+                        }
+                    }
+                    if (i >= length) {
+                        _errors.push_back("Unbalanced square brackets");
+                    }
+                    square_brackets += ']';
+                    i++;
+                    seq.push_back(std::make_pair(square_brackets, false));
+                } else if (c == '|') {
+                    seq.push_back(std::make_pair("|", false));
+                    i++;
+                } else if (c == '*' || c == '+' || c == '?') {
+                    seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
+                    i++;
+                } else if (c == '{') {
+                    std::string curly_brackets = std::string(1, c);
+                    i++;
+                    while (i < length && sub_pattern[i] != '}') {
+                        curly_brackets += sub_pattern[i];
+                        i++;
+                    }
+                    if (i >= length) {
+                        _errors.push_back("Unbalanced curly brackets");
+                    }
+                    curly_brackets += '}';
+                    i++;
+                    auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
+                    int min_times = 0;
+                    int max_times = std::numeric_limits<int>::max();
+                    try {
+                        if (nums.size() == 1) {
+                            min_times = max_times = std::stoi(nums[0]);
+                        } else if (nums.size() != 2) {
+                            _errors.push_back("Wrong number of values in curly brackets");
+                        } else {
+                            if (!nums[0].empty()) {
+                                min_times = std::stoi(nums[0]);
+                            }
+                            if (!nums[1].empty()) {
+                                max_times = std::stoi(nums[1]);
+                            }
+                        }
+                    } catch (const std::logic_error &) {  // std::invalid_argument or std::out_of_range from std::stoi
+                        _errors.push_back("Invalid number in curly brackets");
+                        return std::make_pair("", false);
+                    }
+                    // Work with the quoted form of a literal so that appended quantifiers land outside the quotes.
+                    auto &last = seq.back();
+                    const bool sub_is_literal = last.second;
+                    std::string sub = to_rule(last);
+
+                    if (min_times == 0 && max_times == std::numeric_limits<int>::max()) {
+                        last = std::make_pair(sub + "*", false);
+                    } else if (min_times == 0 && max_times == 1) {
+                        last = std::make_pair(sub + "?", false);
+                    } else if (min_times == 1 && max_times == std::numeric_limits<int>::max()) {
+                        last = std::make_pair(sub + "+", false);
+                    } else {
+                        if (!sub_is_literal) {
+                            std::string & sub_id = sub_rule_ids[sub];
+                            if (sub_id.empty()) {
+                                sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub);
+                            }
+                            sub = sub_id;
+                        }
+                        std::string result;
+                        if (sub_is_literal && min_times > 0) {
+                            result = "\"" + repeat(last.first, min_times) + "\"";  // last.first is the unquoted text
+                        } else {
+                            for (int j = 0; j < min_times; j++) {
+                                if (j > 0) result += " ";
+                                result += sub;
+                            }
+                        }
+                        if (min_times > 0 && min_times < max_times) result += " ";
+                        if (max_times == std::numeric_limits<int>::max()) {
+                            result += sub + "*";
+                        } else {
+                            for (int j = min_times; j < max_times; j++) {
+                                if (j > min_times) result += " ";
+                                result += sub + "?";
+                            }
+                        }
+                        seq.back().first = result;
+                        seq.back().second = false;
+                    }
+                } else {
+                    std::string literal;
+                    auto is_non_literal = [&](char c) {
+                        return NON_LITERAL_SET.find(c) != NON_LITERAL_SET.end();
+                    };
+                    while (i < length) {
+                        if (sub_pattern[i] == '\\' && i < length - 1) {
+                            char next = sub_pattern[i + 1];
+                            if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.find(next) != ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.end()) {
+                                i++;
+                                literal += sub_pattern[i];
+                                i++;
+                            } else {
+                                literal += sub_pattern.substr(i, 2);
+                                i += 2;
+                            }
+                        } else if (sub_pattern[i] == '"') {
+                            literal += "\\\"";
+                            i++;
+                        } else if (!is_non_literal(sub_pattern[i]) &&
+                                (i == length - 1 || literal.empty() || sub_pattern[i + 1] == '.' || !is_non_literal(sub_pattern[i + 1]))) {
+                            literal += sub_pattern[i];
+                            i++;
+                        } else {
+                            break;
+                        }
+                    }
+                    if (!literal.empty()) {
+                        seq.push_back(std::make_pair(literal, true));
+                    }
+                }
+            }
+            return join_seq();
+        };
+        return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
+    }
+
+    // Maps `ref` to a rule name (its last '/'-segment), visiting the target unless that rule exists or `ref` is already in progress.
+    std::string _resolve_ref(const std::string & ref) {
+        std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
+        if (_rules.count(ref_name) == 0 && _refs_being_resolved.count(ref) == 0) {
+            _refs_being_resolved.insert(ref);
+            ref_name = visit(json(_refs[ref]), ref_name);  // visit works on a copy of the stored schema
+            _refs_being_resolved.erase(ref);
+        }
+        return ref_name;
+    }
+
+    std::string _build_object_rule(
+        const std::vector<std::pair<std::string, json>> & properties,
+        const std::unordered_set<std::string> & required,
+        const std::string & name,
+        const json & additional_properties)
+    { // Builds the grammar expression for a JSON object; returns the rule body itself, not a rule name.
+        std::vector<std::string> required_props;
+        std::vector<std::string> optional_props;
+        std::unordered_map<std::string, std::string> prop_kv_rule_names;
+        for (const auto & kv : properties) {
+            const auto &prop_name = kv.first;
+            const auto &prop_schema = kv.second;
+
+            std::string prop_rule_name = visit(prop_schema, name + (name.empty() ? "" : "-") + prop_name);
+            prop_kv_rule_names[prop_name] = _add_rule( // "<name>-<prop>-kv": quoted key, ':' and the value rule
+                name + (name.empty() ? "" : "-") + prop_name + "-kv",
+                format_literal(json(prop_name).dump()) + " space \":\" space " + prop_rule_name
+            );
+            if (required.find(prop_name) != required.end()) {
+                required_props.push_back(prop_name);
+            } else {
+                optional_props.push_back(prop_name);
+            }
+        }
+        if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) { // schema object or `true`
+            std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
+            std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
+            std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
+            prop_kv_rule_names["*"] = kv_rule; // "*" is the placeholder key for additional properties
+            optional_props.push_back("*");
+        }
+
+        std::string rule = "\"{\" space ";
+        for (size_t i = 0; i < required_props.size(); i++) { // required properties come first, comma-separated, in input order
+            if (i > 0) {
+                rule += " \",\" space ";
+            }
+            rule += prop_kv_rule_names[required_props[i]];
+        }
+
+        if (!optional_props.empty()) {
+            rule += " (";
+            if (!required_props.empty()) {
+                rule += " \",\" space ( ";
+            }
+
+            std::function<std::string(const std::vector<std::string> &, bool)> get_recursive_refs = [&](const std::vector<std::string> & ks, bool first_is_optional) {
+                std::string res; // expression for ks[0], followed by a "<k>-rest" rule covering ks[1..]
+                if (ks.empty()) {
+                    return res;
+                }
+                std::string k = ks[0];
+                std::string kv_rule_name = prop_kv_rule_names[k];
+                if (k == "*") {
+                    res = _add_rule(
+                        name + (name.empty() ? "" : "-") + "additional-kvs",
+                        kv_rule_name + " ( \",\" space " + kv_rule_name + " )*"
+                    );
+                } else if (first_is_optional) {
+                    res = "( \",\" space " + kv_rule_name + " )?";
+                } else {
+                    res = kv_rule_name;
+                }
+                if (ks.size() > 1) {
+                    res += " " + _add_rule(
+                        name + (name.empty() ? "" : "-") + k + "-rest",
+                        get_recursive_refs(std::vector<std::string>(ks.begin() + 1, ks.end()), true)
+                    );
+                }
+                return res;
+            };
+
+            for (size_t i = 0; i < optional_props.size(); i++) { // one alternative per optional property that may appear first
+                if (i > 0) {
+                    rule += " | ";
+                }
+                rule += get_recursive_refs(std::vector<std::string>(optional_props.begin() + i, optional_props.end()), false);
+            }
+            if (!required_props.empty()) {
+                rule += " )";
+            }
+            rule += " )?";
+        }
+
+        rule += " \"}\" space";
+
+        return rule;
+    }
+
+public:
+    SchemaConverter(
+        const std::function<json(const std::string &)> & fetch_json, // called by resolve_refs() for "https://" $ref targets
+        bool dotall) // when true, the rule generated for '.' in patterns also covers \x0A and \x0D
+          : _fetch_json(fetch_json), _dotall(dotall)
+    {
+        _rules["space"] = SPACE_RULE; // the "space" rule is registered up front for every converter
+    }
+
+    void resolve_refs(json & schema, const std::string & url) {
+        /*
+        * Resolves all $ref fields in the given schema, fetching any remote schemas,
+        * replacing each local ("#/...") $ref with an absolute reference URL and populating
+        * _refs with the respective referenced (sub)schema dictionaries.
+        */
+        std::function<void(json &)> visit_refs = [&](json & n) {
+            if (n.is_array()) {
+                for (auto & x : n) {
+                    visit_refs(x);
+                }
+            } else if (n.is_object()) {
+                if (n.contains("$ref")) {
+                    std::string ref = n["$ref"];
+                    if (_refs.find(ref) == _refs.end()) {
+                        json target;
+                        if (ref.find("https://") == 0) {
+                            std::string base_url = ref.substr(0, ref.find('#'));
+                            auto it = _refs.find(base_url);
+                            if (it != _refs.end()) {
+                                target = it->second;
+                            } else {
+                                // Fetch the referenced schema, resolve its refs, and keep it as `target` for the fragment lookup below
+                                target = _fetch_json(ref);
+                                resolve_refs(target, base_url);
+                                _refs[base_url] = target;
+                            }
+                            if (ref.find('#') == std::string::npos || ref.substr(ref.find('#') + 1).empty()) {
+                                return;
+                            }
+                        } else if (ref.find("#/") == 0) {
+                            target = schema;
+                            n["$ref"] = url + ref;
+                            ref = url + ref;
+                        } else {
+                            _errors.push_back("Unsupported ref: " + ref);
+                            return;
+                        }
+                        std::string pointer = ref.substr(ref.find('#') + 1);
+                        std::vector<std::string> tokens = split(pointer, "/");
+                        for (size_t i = 1; i < tokens.size(); ++i) {
+                            std::string sel = tokens[i];
+                            if (target.is_null() || !target.contains(sel)) {
+                                _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
+                                return;
+                            }
+                            target = target[sel];
+                        }
+                        _refs[ref] = target;
+                    }
+                } else {
+                    for (auto & kv : n.items()) {
+                        visit_refs(kv.value());
+                    }
+                }
+            }
+        };
+
+        visit_refs(schema);
+    }
+
+    std::string _generate_constant_rule(const json & value) {
+        return format_literal(value.dump()); // quoted grammar literal built from the JSON serialization of `value`
+    }
+
+    // Translates `schema` into grammar rules, registering them through _add_rule.
+    //
+    // schema:  the (sub)schema to convert.
+    // name:    rule-name prefix for this schema; an empty name is emitted as "root".
+    // returns: the string used to reference the generated rule (the value handed back by
+    //          _add_rule, or "<format>-string" for date formats); "" after recording an
+    //          error for a schema that is not recognized.
+    std::string visit(const json & schema, const std::string & name) {
+        json schema_type = schema.contains("type") ? schema["type"] : json();
+        std::string schema_format = schema.contains("format") ? schema["format"].get<std::string>() : "";
+        std::string rule_name = is_reserved_name(name) ? name + "-" : name.empty() ? "root" : name;
+
+        if (schema.contains("$ref")) {
+            return _add_rule(rule_name, _resolve_ref(schema["$ref"]));
+        } else if (schema.contains("oneOf") || schema.contains("anyOf")) {
+            std::vector<json> alt_schemas = schema.contains("oneOf") ? schema["oneOf"].get<std::vector<json>>() : schema["anyOf"].get<std::vector<json>>();
+            return _add_rule(rule_name, _generate_union_rule(name, alt_schemas));
+        } else if (schema_type.is_array()) {
+            // "type": [a, b, ...] is handled as a union of single-type schemas
+            std::vector<json> schema_types;
+            for (const auto & t : schema_type) {
+                schema_types.push_back({{"type", t}});
+            }
+            return _add_rule(rule_name, _generate_union_rule(name, schema_types));
+        } else if (schema.contains("const")) {
+            return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
+        } else if (schema.contains("enum")) {
+            std::vector<std::string> enum_values;
+            for (const auto & v : schema["enum"]) {
+                enum_values.push_back(_generate_constant_rule(v));
+            }
+            return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | "));
+        } else if ((schema_type.is_null() || schema_type == "object")
+                && (schema.contains("properties") ||
+                    (schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
+            // object with explicit properties and/or constrained additionalProperties
+            std::unordered_set<std::string> required;
+            if (schema.contains("required") && schema["required"].is_array()) {
+                for (const auto & item : schema["required"]) {
+                    if (item.is_string()) {
+                        required.insert(item.get<std::string>());
+                    }
+                }
+            }
+            std::vector<std::pair<std::string, json>> properties;
+            if (schema.contains("properties")) {
+                for (const auto & prop : schema["properties"].items()) {
+                    properties.emplace_back(prop.key(), prop.value());
+                }
+            }
+            return _add_rule(rule_name,
+                _build_object_rule(
+                    properties, required, name,
+                    schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
+        } else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
+            // allOf: merge the properties of every component into a single object rule
+            std::unordered_set<std::string> required;
+            std::vector<std::pair<std::string, json>> properties;
+            std::string hybrid_name = name;
+            std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
+                if (comp_schema.contains("$ref")) {
+                    add_component(_refs[comp_schema["$ref"]], is_required);
+                } else if (comp_schema.contains("properties")) {
+                    for (const auto & prop : comp_schema["properties"].items()) {
+                        properties.emplace_back(prop.key(), prop.value());
+                        if (is_required) {
+                            required.insert(prop.key());
+                        }
+                    }
+                } else {
+                  // todo warning
+                }
+            };
+            for (auto & t : schema["allOf"]) {
+                if (t.contains("anyOf")) {
+                    // properties coming from anyOf alternatives are collected as optional
+                    for (auto & tt : t["anyOf"]) {
+                        add_component(tt, false);
+                    }
+                } else {
+                    add_component(t, true);
+                }
+            }
+            return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
+        } else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
+            json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];
+            if (items.is_array()) {
+                // tuple: one rule per position, separated by commas
+                std::string rule = "\"[\" space ";
+                for (size_t i = 0; i < items.size(); i++) {
+                    if (i > 0) {
+                        rule += " \",\" space ";
+                    }
+                    rule += visit(items[i], name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(i));
+                }
+                rule += " \"]\" space";
+                return _add_rule(rule_name, rule);
+            } else {
+                std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item");
+                std::string list_item_operator = "( \",\" space " + item_rule_name + " )";
+                std::string successive_items;
+                int min_items = schema.contains("minItems") ? schema["minItems"].get<int>() : 0;
+                json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json();
+                int max_items = max_items_json.is_number_integer() ? max_items_json.get<int>() : -1;
+                // Must be decided before min_items is decremented below: only an array without a
+                // lower bound may be empty. Testing the decremented value would make the whole
+                // item list optional for minItems == 1 and wrongly accept "[]".
+                const bool allow_empty = min_items == 0;
+                if (min_items > 0) {
+                    // the first item is emitted explicitly in the final rule, so only
+                    // min_items - 1 mandatory ", item" repetitions are needed here
+                    successive_items += repeat(list_item_operator, min_items - 1);
+                    min_items--;
+                }
+                if (max_items >= 0 && max_items > min_items) {
+                    successive_items += repeat(list_item_operator + "?", max_items - min_items - 1);
+                } else {
+                    successive_items += list_item_operator + "*";
+                }
+                std::string rule;
+                if (allow_empty) {
+                    rule =  "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space";
+                } else {
+                    rule =  "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space";
+                }
+                return _add_rule(rule_name, rule);
+            }
+        } else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) {
+            return _visit_pattern(schema["pattern"], rule_name);
+        } else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) {
+            return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid"));
+        } else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) {
+            // register every date rule, then reference the one matching the requested format
+            for (const auto & kv : DATE_RULES) {
+                _add_rule(kv.first, kv.second);
+            }
+            return schema_format + "-string";
+        } else if (schema.empty() || schema_type == "object") {
+            // unconstrained object: register the rules listed in OBJECT_RULE_NAMES and reference "object"
+            for (const auto & n : OBJECT_RULE_NAMES) {
+                _add_rule(n, PRIMITIVE_RULES.at(n));
+            }
+            return _add_rule(rule_name, "object");
+        } else {
+            if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get<std::string>()) == PRIMITIVE_RULES.end()) {
+                _errors.push_back("Unrecognized schema: " + schema.dump());
+                return "";
+            }
+            // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
+            return _add_rule(rule_name == "root" ? "root" : schema_type.get<std::string>(), PRIMITIVE_RULES.at(schema_type.get<std::string>()));
+        }
+    }
+
+    // Reports the outcome of the conversion: throws std::runtime_error listing every recorded
+    // error, otherwise prints the recorded warnings (if any) to stderr.
+    void check_errors() {
+        const bool has_errors = !_errors.empty();
+        if (has_errors) {
+            const std::string details = join(_errors.begin(), _errors.end(), "\n");
+            throw std::runtime_error("JSON schema conversion failed:\n" + details);
+        }
+        if (_warnings.empty()) {
+            return;
+        }
+        const std::string summary = join(_warnings.begin(), _warnings.end(), "; ");
+        fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", summary.c_str());
+    }
+
+    // Serializes every entry of _rules as one "<name> ::= <definition>" line, in the
+    // container's iteration order.
+    std::string format_grammar() {
+        std::stringstream out;
+        for (auto it = _rules.begin(); it != _rules.end(); ++it) {
+            out << it->first << " ::= " << it->second << std::endl;
+        }
+        return out.str();
+    }
+};
+
+// Converts a JSON schema into a grammar string.
+// Throws (via check_errors) if the conversion recorded any error.
+std::string json_schema_to_grammar(const json & schema) {
+    // the fetch callback ignores its argument and always yields an empty JSON object
+    const auto fetch_json = [](const std::string &) { return json::object(); };
+    SchemaConverter converter(fetch_json, /* dotall= */ false);
+
+    // work on a private copy: resolve_refs receives it by name and the caller's schema is const
+    json working_copy = schema;
+    converter.resolve_refs(working_copy, "input");
+    converter.visit(working_copy, "");
+    converter.check_errors();
+
+    return converter.format_grammar();
+}
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@@ -0,0 +1,4 @@
+#pragma once
+#include "json.hpp"
+
+std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
--- a/examples/server/json.hpp
+++ b/examples/server/json.hpp
--- a/common/log.h
+++ b/common/log.h
@@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // INTERNAL, DO NOT USE
 //  USE LOG() INSTEAD
 //
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
    #define LOG_IMPL(str, ...)                                                                                      \
    do {                                                                                                            \
        if (LOG_TARGET != nullptr)                                                                                  \
@@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // INTERNAL, DO NOT USE
 //  USE LOG_TEE() INSTEAD
 //
-#ifndef _MSC_VER
+#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
    #define LOG_TEE_IMPL(str, ...)                                                                                                      \
    do {                                                                                                                                \
        if (LOG_TARGET != nullptr)                                                                                                      \
@@ -566,6 +566,7 @@ inline void log_print_usage()
    printf("  --log-new             Create a separate new log file on start. "
                                   "Each log file will have unique name: \"<name>.<ID>.log\"\n");
    printf("  --log-append          Don't truncate the old log file.\n");
+    printf("\n");
 }

 #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
--- a/common/ngram-cache.cpp
+++ b/common/ngram-cache.cpp
@@ -0,0 +1,282 @@
+#include "ngram-cache.h"
+#include "common.h"
+#include "log.h"
+
+#include <cinttypes>
+#include <cstdint>
+#include <cstdio>
+#include <fstream>
+
+// Counts, for every ngram size in [ngram_min, ngram_max], which token follows each ngram
+// in the newest part of inp and accumulates those counts in ngram_cache.
+// nnew is the number of tokens appended to inp since the previous call; when
+// print_progress is set, an ETA line is written to stderr every 10000000 updates.
+void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
+                              std::vector<llama_token> & inp, int nnew, bool print_progress) {
+    const int64_t t_start_ms = ggml_time_ms();
+    const int64_t inp_size   = inp.size();
+    const int64_t n_todo     = inp_size * (ngram_max - ngram_min + 1);
+
+    int64_t n_done = 0;
+    for (int64_t ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) {
+        // start at the first new token that has a complete ngram of this size in front of it
+        int64_t i = std::max(inp_size - nnew, ngram_size);
+        for (; i < inp_size; ++i) {
+            const llama_ngram ngram(&inp[i - ngram_size], ngram_size);
+
+            // operator[] value-initializes missing entries, so unseen ngrams/tokens start at 0
+            ++ngram_cache[ngram][inp[i]];
+            ++n_done;
+
+            if (!print_progress || n_done % 10000000 != 0) {
+                continue;
+            }
+
+            const int64_t t_elapsed_ms = ggml_time_ms() - t_start_ms;
+            const int64_t eta_ms       = (n_todo - n_done) * t_elapsed_ms / n_done;
+            const int64_t eta_min      = eta_ms / (60*1000);
+            const int64_t eta_s        = (eta_ms - 60*1000*eta_min) / 1000;
+
+            fprintf(stderr, "%s: %" PRId64 "/%" PRId64 " done, ETA: %02" PRId64 ":%02" PRId64 "\n", __func__, n_done, n_todo, eta_min, eta_s);
+        }
+    }
+}
+
+// Reads position i of the speculative sequence formed by inp followed by the drafted tokens.
+// Positions past the end of inp continue at draft[1]; draft[0] is never returned.
+static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
+    if (i < inp.size()) {
+        return inp[i];
+    }
+    return draft[1 + i - inp.size()];
+}
+
+// If sample size or percentage are below these thresholds the draft is aborted early.
+// Each table has LLAMA_NGRAM_MAX entries. The static-cache lookup reads entry
+// LLAMA_NGRAM_STATIC-1, i.e. (ngram size - 1); the primary-cache lookup reads the entry at
+// the ngram's position in the list it is given.
+// NOTE(review): position and (ngram size - 1) only coincide when that list starts at ngram size 1 - confirm callers.
+// "lax" thresholds are used for the context cache, "strict" ones for the dynamic cache.
+constexpr int    draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2,  2,  1,  1};
+constexpr int        draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50};
+constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4,  3,  2,  2};
+constexpr int     draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
+
+// Helper function that tries to draft a token from only the static ngram cache.
+// Returns the most frequent follow-up token of ngram_static, or -1 when the ngram is
+// unknown or the lax sample-size / percentage thresholds are not met.
+static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
+    llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+    if (part_static_it == nc_static.end()) {
+        return -1;
+    }
+    // bind by reference: copying the whole token->count map on every lookup is needless work
+    const llama_ngram_cache_part & part_static = part_static_it->second;
+
+    int max_count_static  = 0;
+    int sum_count_static  = 0;
+    llama_token max_token = -1;
+
+    for (const auto & token_count_static : part_static) {
+        const llama_token token = token_count_static.first;
+        const int32_t count_static  = token_count_static.second;
+
+        if (count_static > max_count_static) {
+            max_token        = token;
+            max_count_static = count_static;
+        }
+        sum_count_static += count_static;
+    }
+
+    // too few observations to trust the statistics
+    if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
+        return -1;
+    }
+    // the best candidate does not dominate the distribution enough
+    if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
+        return -1;
+    }
+    return max_token;
+}
+
+// Try to draft a token from primary cache (context/dynamic), validate with static cache.
+// ngrams_primary is tried from its last entry to its first; the first ngram whose
+// statistics satisfy min_sample_size[i] / min_percent[i] decides the result.
+// Candidates are ranked by primary count times static weight (100 * static count, or 1
+// when the token is absent from part_static). Returns -1 when nothing qualifies.
+static llama_token try_draft(
+    llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
+    const int * min_sample_size, const int * min_percent) {
+
+    llama_token drafted_token = -1;
+
+    for (int i = (int) ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
+        const llama_ngram & ngram_primary = ngrams_primary[i];
+
+        llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
+        if (part_primary_it == nc_primary.end()) {
+            continue;
+        }
+        // bind by reference: copying the whole token->count map for every ngram is needless work
+        const llama_ngram_cache_part & part_primary = part_primary_it->second;
+
+        int max_count_primary = 0;
+        int max_count_static  = 0;
+        int sum_count_primary = 0;
+        llama_token max_token = -1;
+
+        for (const auto & token_count_primary : part_primary) {
+            const llama_token token = token_count_primary.first;
+
+            llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
+
+            const int32_t count_primary = token_count_primary.second;
+            const int32_t count_static  = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
+
+            if (count_primary*count_static > max_count_primary*max_count_static) {
+                max_token         = token;
+                max_count_primary = count_primary;
+                max_count_static  = count_static;
+            }
+            sum_count_primary += count_primary;
+        }
+
+        // too few observations for this ngram
+        if (sum_count_primary < min_sample_size[i]) {
+            continue;
+        }
+        // the best candidate does not dominate the distribution enough
+        if (100*max_count_primary < min_percent[i]*sum_count_primary) {
+            continue;
+        }
+        drafted_token = max_token;
+    }
+
+    return drafted_token;
+}
+
+// Appends up to n_draft drafted tokens to draft, which must initially hold exactly one token.
+// For every new token the context cache is consulted first (lax thresholds), then the
+// dynamic cache (strict thresholds), then the static cache alone; drafting stops as soon
+// as none of them yields a token. Nothing is drafted when inp holds fewer than
+// LLAMA_NGRAM_STATIC tokens.
+void llama_ngram_cache_draft(
+    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
+    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
+) {
+    GGML_ASSERT(draft.size() == 1);
+    const int inp_size = inp.size();
+
+    if (inp_size < LLAMA_NGRAM_STATIC) {
+        return;
+    }
+
+    while ((int) draft.size()-1 < n_draft) {
+        // length of the speculative sequence: inp followed by the tokens drafted so far
+        const int n_tokens = inp_size + (int) draft.size() - 1;
+
+        const int ngram_start_static = n_tokens - LLAMA_NGRAM_STATIC;
+        llama_ngram ngram_static;
+        for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
+            ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
+        }
+        llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+        // stays empty when the static cache does not know this ngram
+        llama_ngram_cache_part part_static;
+        if (part_static_it != nc_static.end()) {
+            part_static = part_static_it->second;
+        }
+
+        // cd = context + dynamic
+        std::vector<llama_ngram> ngrams_cd;
+        for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
+            const int ngram_start_cd = n_tokens - ngram_size_cd;
+            if (ngram_start_cd < 0) {
+                // Fewer tokens are available than this ngram size (and every larger one) needs;
+                // a negative start would make get_token read out of bounds.
+                break;
+            }
+            llama_ngram ngram_cd;
+            for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
+                ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
+            }
+            ngrams_cd.push_back(ngram_cd);
+        }
+
+        llama_token drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
+        if (drafted_token == -1) {
+            drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
+        }
+        if (drafted_token == -1) {
+            drafted_token = try_draft(nc_static, ngram_static);
+        }
+
+        if (drafted_token == -1) {
+            break;
+        }
+
+        LOG(" - draft candidate: token=%d\n", drafted_token);
+        draft.push_back(drafted_token);
+    }
+}
+
+// Writes ngram_cache to filename as a sequence of binary records:
+//   llama_ngram, int32_t number of entries, then that many (llama_token, int32_t count) pairs.
+// Every cache part must be non-empty and every count positive (asserted).
+// NOTE(review): a failure to open the file is not reported here - confirm that is intended.
+void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
+    std::ofstream file_out(filename, std::ios::binary);
+    // iterate by reference: copying each ngram's token->count map just to serialize it is needless work
+    for (const auto & item : ngram_cache) {
+        const llama_ngram            & ngram        = item.first;
+        const llama_ngram_cache_part & token_counts = item.second;
+        GGML_ASSERT(!token_counts.empty());
+        const int32_t ntokens = token_counts.size();
+        GGML_ASSERT(ntokens > 0);
+
+        file_out.write(reinterpret_cast<const char *>(&ngram),   sizeof(llama_ngram));
+        file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
+        for (const auto & item2 : token_counts) {
+            const llama_token token = item2.first;
+            const int32_t     count = item2.second;
+            GGML_ASSERT(count > 0);
+
+            file_out.write(reinterpret_cast<const char *>(&token), sizeof(llama_token));
+            file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
+        }
+    }
+
+}
+
+// Reads an ngram cache from filename. The file is a sequence of records, each made of a
+// llama_ngram, an int32_t entry count and that many (llama_token, int32_t count) pairs.
+// Throws std::ifstream::failure when the file cannot be opened; truncated or malformed
+// records trip a GGML_ASSERT.
+llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
+    std::ifstream hashmap_file(filename, std::ios::binary);
+    if (!hashmap_file) {
+        throw std::ifstream::failure("Unable to open file " + filename);
+    }
+
+    llama_ngram_cache ngram_cache;
+    llama_ngram       ngram;
+
+    // a failed read of the next ngram marks the end of the data
+    while (hashmap_file.read(reinterpret_cast<char *>(&ngram), sizeof(llama_ngram))) {
+        GGML_ASSERT(!hashmap_file.eof());
+
+        int32_t ntokens;
+        GGML_ASSERT(hashmap_file.read(reinterpret_cast<char *>(&ntokens), sizeof(int32_t)));
+        GGML_ASSERT(ntokens > 0);
+
+        llama_ngram_cache_part token_counts;
+        for (int32_t remaining = ntokens; remaining > 0; --remaining) {
+            llama_token token;
+            int32_t     count;
+
+            GGML_ASSERT(!hashmap_file.eof());
+            GGML_ASSERT(hashmap_file.read(reinterpret_cast<char *>(&token), sizeof(llama_token)));
+            GGML_ASSERT(!hashmap_file.eof());
+            GGML_ASSERT(hashmap_file.read(reinterpret_cast<char *>(&count), sizeof(int32_t)));
+            GGML_ASSERT(count > 0);
+            token_counts.emplace(token, count);
+        }
+
+        ngram_cache.emplace(ngram, token_counts);
+    }
+    // the loop must have ended because the file was exhausted, not because of another read error
+    GGML_ASSERT(hashmap_file.eof());
+
+    return ngram_cache;
+}
+
+// Adds every (ngram, token, count) entry of ngram_cache_add to ngram_cache_target:
+// unknown ngrams and tokens are inserted, counts of known tokens are summed.
+// Every count in ngram_cache_add must be positive (asserted).
+void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
+    // iterate by reference: copying each ngram's token->count map just to read it is needless work
+    for (const auto & ngram_part : ngram_cache_add) {
+        const llama_ngram            & ngram = ngram_part.first;
+        const llama_ngram_cache_part & part  = ngram_part.second;
+
+        llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
+        if (part_merged_it == ngram_cache_target.end()) {
+            // ngram not known to the target yet: take over the whole part
+            ngram_cache_target.emplace(ngram, part);
+            continue;
+        }
+
+        for (const auto & token_count : part) {
+            const llama_token token = token_count.first;
+            const int32_t     count = token_count.second;
+            GGML_ASSERT(count > 0);
+
+            llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
+            if (token_count_merged_it == part_merged_it->second.end()) {
+                part_merged_it->second.emplace(token, count);
+                continue;
+            }
+
+            token_count_merged_it->second += count;
+        }
+    }
+}
--- a/common/ngram-cache.h
+++ b/common/ngram-cache.h
@@ -0,0 +1,94 @@
+#pragma once
+
+#include "llama.h"
+
+#include <unordered_map>
+#include <string>
+#include <vector>
+
+#define LLAMA_NGRAM_MIN    1
+#define LLAMA_NGRAM_MAX    4
+#define LLAMA_NGRAM_STATIC 2
+
+// Data structures to map n-grams to empirical token probabilities:
+
+// Fixed-size ngram of up to LLAMA_NGRAM_MAX tokens; unused slots hold -1.
+struct llama_ngram {
+    llama_token tokens[LLAMA_NGRAM_MAX];
+
+    // Creates an ngram whose slots all hold the padding value -1.
+    llama_ngram() {
+        for (llama_token & slot : tokens) {
+            slot = -1;
+        }
+    }
+
+    // Copies the first ngram_size tokens of input (at most LLAMA_NGRAM_MAX) and pads the rest with -1.
+    llama_ngram(const llama_token * input, const int ngram_size) {
+        int i = 0;
+        for (; i < ngram_size && i < LLAMA_NGRAM_MAX; ++i) {
+            tokens[i] = input[i];
+        }
+        for (; i < LLAMA_NGRAM_MAX; ++i) {
+            tokens[i] = -1;
+        }
+    }
+
+    // Two ngrams are equal when all LLAMA_NGRAM_MAX slots match, padding included.
+    bool operator==(const llama_ngram & other) const {
+        int i = 0;
+        while (i < LLAMA_NGRAM_MAX && tokens[i] == other.tokens[i]) {
+            ++i;
+        }
+        return i == LLAMA_NGRAM_MAX;
+    }
+};
+
+// Hash functor for llama_ngram keys.
+// The per-token hashes are folded with an order-sensitive mix. A plain XOR would give
+// every permutation of the same tokens the same hash and would let equal tokens
+// (including pairs of -1 padding slots) cancel each other out, producing systematic
+// collisions in the cache.
+struct llama_ngram_hash_function {
+    size_t operator()(const llama_ngram & ngram) const {
+        size_t hash = 0;
+        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+            // mix in the running value so that both position and repetition affect the result
+            hash ^= std::hash<llama_token>{}(ngram.tokens[i]) + 0x9e3779b9 + (hash << 6) + (hash >> 2);
+        }
+        return hash;
+    }
+};
+
+// token -> number of times token has been seen
+typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
+
+// n-gram -> empirical distribution of following tokens
+typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
+
+
+// Update an ngram cache with tokens.
+// ngram_cache:         the cache to modify.
+// ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data.
+// inp_data:            the token sequence with which to update ngram_cache.
+// nnew:                how many new tokens have been appended to inp_data since the last call to this function.
+// print_progress:      whether to print progress to stderr.
+//
+// In order to get correct results inp_data can ONLY BE APPENDED TO.
+// Changes in the middle need a complete rebuild.
+void llama_ngram_cache_update(
+    llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
+
+// Try to draft tokens from ngram caches.
+// inp:                the tokens generated so far.
+// draft:              the token sequence to draft. Expected to initially contain the previously sampled token.
+// n_draft:            maximum number of tokens to add to draft.
+// ngram_min/ngram_max: the min/max size of the ngrams in nc_context and nc_dynamic.
+// nc_context:         ngram cache based on current context.
+// nc_dynamic:         ngram cache based on previous user generations.
+// nc_static:          ngram cache generated from a large text corpus, used for validation.
+void llama_ngram_cache_draft(
+    std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
+    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
+
+// Save an ngram cache to a file.
+// ngram_cache: the ngram cache to save.
+// filename:    the path under which to save the ngram cache.
+void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
+
+// Load an ngram cache saved with llama_ngram_cache_save.
+// filename: the path from which to load the ngram cache.
+// returns:  an ngram cache containing the information saved to filename.
+llama_ngram_cache llama_ngram_cache_load(std::string & filename);
+
+// Merge two ngram caches.
+// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
+// ngram_cache_add:    the ngram cache to add to ngram_cache_target.
+void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -168,77 +168,20 @@ static llama_token llama_sampling_sample_impl(
                  bool is_resampling) {  // Add a parameter to indicate if we are resampling
    const llama_sampling_params & params = ctx_sampling->params;

-    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
-
    const float   temp            = params.temp;
-    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
-    const float   penalty_repeat  = params.penalty_repeat;
-    const float   penalty_freq    = params.penalty_freq;
-    const float   penalty_present = params.penalty_present;
    const int     mirostat        = params.mirostat;
    const float   mirostat_tau    = params.mirostat_tau;
    const float   mirostat_eta    = params.mirostat_eta;
-    const bool    penalize_nl     = params.penalize_nl;
-
-    auto & prev = ctx_sampling->prev;
-    auto & cur  = ctx_sampling->cur;

+    std::vector<float> original_logits;
+    auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits);
+    if (!is_resampling) {
+        GGML_ASSERT(!original_logits.empty());
+    }
    llama_token id = 0;
-
    // Get a pointer to the logits
    float * logits = llama_get_logits_ith(ctx_main, idx);

-    // Declare original_logits at the beginning of the function scope
-    std::vector<float> original_logits;
-
-    if (!is_resampling) {
-        // Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this.
-        original_logits = std::vector<float>(logits, logits + llama_n_vocab(llama_get_model(ctx_main)));
-    }
-
-    // apply params.logit_bias map
-    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-        logits[it->first] += it->second;
-    }
-
-    if (ctx_cfg) {
-        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
-        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
-    }
-
-    cur.clear();
-
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-    }
-
-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
-
-    // apply penalties
-    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
-    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
-    if (penalty_tokens_used_size) {
-        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
-
-        llama_sample_repetition_penalties(ctx_main, &cur_p,
-                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
-                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
-
-        if (!penalize_nl) {
-            for (size_t idx = 0; idx < cur_p.size; idx++) {
-                if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
-                    cur_p.data[idx].logit = nl_logit;
-                    break;
-                }
-            }
-        }
-    }
-
-    // If we are in the resampling phase, apply grammar checks before sampling logic
-    if (is_resampling && ctx_sampling->grammar != NULL) {
-        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
-    }
-
    if (temp < 0.0) {
        // greedy sampling, with probs
        llama_sample_softmax(ctx_main, &cur_p);
@@ -302,11 +245,13 @@ static llama_token llama_sampling_sample_impl(
    return id;
 }

-static llama_token_data_array llama_sample_probability_distribution_impl(
+static llama_token_data_array llama_sampling_prepare_impl(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
                  struct llama_context * ctx_cfg,
-                  const int idx) {
+                  const int idx,
+                  bool apply_grammar,
+                  std::vector<float> * original_logits) {
    const llama_sampling_params & params = ctx_sampling->params;

    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
@@ -315,6 +260,7 @@ static llama_token_data_array llama_sample_probability_distribution_impl(
    const float   penalty_repeat  = params.penalty_repeat;
    const float   penalty_freq    = params.penalty_freq;
    const float   penalty_present = params.penalty_present;
+
    const bool    penalize_nl     = params.penalize_nl;

    auto & prev = ctx_sampling->prev;
@@ -323,8 +269,10 @@ static llama_token_data_array llama_sample_probability_distribution_impl(
    // Get a pointer to the logits
    float * logits = llama_get_logits_ith(ctx_main, idx);

-    // Declare original_logits at the beginning of the function scope
-    std::vector<float> original_logits;
+    if (apply_grammar && original_logits != NULL) {
+        // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
+        *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
+    }

    // apply params.logit_bias map
    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
@@ -364,12 +312,11 @@ static llama_token_data_array llama_sample_probability_distribution_impl(
        }
    }

-    // apply grammar checks
-    if (ctx_sampling->grammar != NULL) {
+    // apply grammar checks before sampling logic
+    if (apply_grammar && ctx_sampling->grammar != NULL) {
        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
    }

-    llama_sample_softmax(ctx_main, &cur_p);
    return cur_p;
 }

@@ -382,12 +329,14 @@ llama_token llama_sampling_sample(
    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
 }

-llama_token_data_array llama_sampling_probability_distribution(
+llama_token_data_array llama_sampling_prepare(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
                  struct llama_context * ctx_cfg,
-                  const int idx) {
-    return llama_sample_probability_distribution_impl(ctx_sampling,ctx_main, ctx_cfg, idx);
+                  const int idx,
+                  bool apply_grammar,
+                  std::vector<float> * original_logits) {
+    return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits);
 }

 void llama_sampling_accept(
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -32,13 +32,13 @@ typedef struct llama_sampling_params {
    float       dynatemp_range        = 0.00f;    // 0.0 = disabled
    float       dynatemp_exponent     = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler
    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float       penalty_repeat        = 1.10f;    // 1.0 = disabled
+    float       penalty_repeat        = 1.00f;    // 1.0 = disabled
    float       penalty_freq          = 0.00f;    // 0.0 = disabled
    float       penalty_present       = 0.00f;    // 0.0 = disabled
    int32_t     mirostat              = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float       mirostat_tau          = 5.00f;    // target entropy
    float       mirostat_eta          = 0.10f;    // learning rate
-    bool        penalize_nl           = true;     // consider newlines as a repeatable token
+    bool        penalize_nl           = false;     // consider newlines as a repeatable token

    std::vector<llama_sampler_type> samplers_sequence = {
        llama_sampler_type::TOP_K,
@@ -131,12 +131,14 @@ llama_token llama_sampling_sample(
        struct llama_context * ctx_cfg,
        int idx = 0);

-// returns the probability that token of given id will be sampled
-llama_token_data_array llama_sampling_probability_distribution(
+// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters.
+llama_token_data_array llama_sampling_prepare(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
        struct llama_context * ctx_cfg,
-        int idx = 0);
+        int idx = 0,
+        bool apply_grammar = true,
+        std::vector<float> * original_logits = nullptr);

 void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -93,31 +93,42 @@ class Model(ABC):

        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
            self.gguf_writer.add_context_length(n_ctx)
+            print(f"gguf: context length = {n_ctx}")

        n_embd = self.find_hparam(["hidden_size", "n_embd"])
        self.gguf_writer.add_embedding_length(n_embd)
+        print(f"gguf: embedding length = {n_embd}")

        if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
+            print(f"gguf: feed forward length = {n_ff}")

        n_head = self.find_hparam(["num_attention_heads", "n_head"])
        self.gguf_writer.add_head_count(n_head)
+        print(f"gguf: head count = {n_head}")

        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
            self.gguf_writer.add_head_count_kv(n_head_kv)
+            print(f"gguf: key-value head count = {n_head_kv}")

        if (rope_theta := self.hparams.get("rope_theta")) is not None:
            self.gguf_writer.add_rope_freq_base(rope_theta)
+            print(f"gguf: rope theta = {rope_theta}")
        if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
            self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
+            print(f"gguf: rms norm epsilon = {f_rms_eps}")
        if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
            self.gguf_writer.add_layer_norm_eps(f_norm_eps)
+            print(f"gguf: layer norm epsilon = {f_norm_eps}")
        if (n_experts := self.hparams.get("num_local_experts")) is not None:
            self.gguf_writer.add_expert_count(n_experts)
+            print(f"gguf: expert count = {n_experts}")
        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
            self.gguf_writer.add_expert_used_count(n_experts_used)
+            print(f"gguf: experts used count = {n_experts_used}")

        self.gguf_writer.add_file_type(self.ftype)
+        print(f"gguf: file type = {self.ftype}")

    def write_tensors(self):
        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
@@ -320,7 +331,7 @@ class Model(ABC):
        tokenizer = SentencePieceProcessor(str(tokenizer_path))
        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

-        for token_id in range(vocab_size):
+        for token_id in range(tokenizer.vocab_size()):
            piece = tokenizer.id_to_piece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.get_score(token_id)
@@ -345,9 +356,13 @@ class Model(ABC):
                added_tokens_json = json.load(f)

                for key in added_tokens_json:
-                    tokens.append(key.encode("utf-8"))
-                    scores.append(-1000.0)
-                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+                    key = key.encode("utf-8")
+                    if key not in tokens:
+                        tokens.append(key)
+                        scores.append(-1000.0)
+                        toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+
+        assert len(tokens) == vocab_size

        self.gguf_writer.add_tokenizer_model("llama")
        self.gguf_writer.add_token_list(tokens)
@@ -1051,6 +1066,21 @@ class MixtralModel(Model):
        self._set_vocab_sentencepiece()


+@Model.register("GrokForCausalLM")
+class GrokModel(Model):
+    model_arch = gguf.MODEL_ARCH.GROK
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_name("Grok")
+
+
@Model.register("MiniCPMForCausalLM")
 class MiniCPMModel(Model):
    model_arch = gguf.MODEL_ARCH.MINICPM
@@ -1634,7 +1664,7 @@ in chat mode so that the conversation can end normally.")
                self.post_write_tensors(tensor_map, name, data_torch)


-@Model.register("BertModel")
+@Model.register("BertModel", "CamembertModel")
 class BertModel(Model):
    model_arch = gguf.MODEL_ARCH.BERT

@@ -1965,6 +1995,23 @@ class MambaModel(Model):
            self.gguf_writer.add_tensor(new_name, data)


+@Model.register("CohereForCausalLM")
+class CommandR2Model(Model):
+    model_arch = gguf.MODEL_ARCH.COMMAND_R
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # max_position_embeddings = 8192 in config.json but model was actually
+        # trained on 128k context length
+        self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
 ###### CONVERSION LOGIC ######


--- a/convert.py
+++ b/convert.py
@@ -1167,9 +1167,9 @@ class OutputFile:
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type

-    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
+    if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
        return GGMLFileType.AllF32
-    if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
+    if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
        return GGMLFileType.MostlyF16
    if output_type_str == "q8_0":
        return GGMLFileType.MostlyQ8_0
--- a/docs/token_generation_performance_tips.md
+++ b/docs/token_generation_performance_tips.md
@@ -1,7 +1,7 @@
 # Token generation performance troubleshooting

-## Verifying that the model is running on the GPU with cuBLAS
-Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
+## Verifying that the model is running on the GPU with CUDA
+Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cuda), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
 ./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -21,6 +21,7 @@ else()
    add_subdirectory(embedding)
    add_subdirectory(finetune)
    add_subdirectory(gritlm)
+    add_subdirectory(gguf-split)
    add_subdirectory(infill)
    add_subdirectory(llama-bench)
    add_subdirectory(llava)
@@ -33,6 +34,7 @@ else()
    add_subdirectory(perplexity)
    add_subdirectory(quantize)
    add_subdirectory(quantize-stats)
+    add_subdirectory(retrieval)
    add_subdirectory(save-load-state)
    add_subdirectory(simple)
    add_subdirectory(passkey)
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -48,6 +48,8 @@ int main(int argc, char ** argv) {
        params.prompt = "Hello my name is";
    }

+    process_escapes(params.prompt);
+
    // init LLM

    llama_backend_init();
@@ -78,7 +80,7 @@ int main(int argc, char ** argv) {
    llama_context_params ctx_params = llama_context_default_params();

    ctx_params.seed  = 1234;
-    ctx_params.n_ctx = n_kv_req;
+    ctx_params.n_ctx   = n_kv_req;
    ctx_params.n_batch = std::max(n_len, n_parallel);
    ctx_params.n_seq_max       = n_parallel;
    ctx_params.n_threads       = params.n_threads;
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -21,6 +21,8 @@ An example command using a model from [karpathy/tinyllamas](https://huggingface.

 `$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`

+Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K).
+
 Now you can use the model with a command like:

 `$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -1,6 +1,7 @@
 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
+#include "log.h"

 #include <unordered_map>
 #include <vector>
@@ -78,111 +79,101 @@ typedef struct {

 struct TransformerWeights {
    // token embedding table
-    float* token_embedding_table;    // (vocab_size, dim)
+    std::vector<float> token_embedding_table;    // (vocab_size, dim)
    // weights for rmsnorms
-    float* rms_att_weight; // (layer, dim) rmsnorm weights
-    float* rms_ffn_weight; // (layer, dim)
+    std::vector<float> rms_att_weight; // (layer, dim) rmsnorm weights
+    std::vector<float> rms_ffn_weight; // (layer, dim)
    // weights for matmuls
-    float* wq; // (layer, dim, dim)
-    float* wk; // (layer, dim, dim)
-    float* wv; // (layer, dim, dim)
-    float* wo; // (layer, dim, dim)
+    std::vector<float> wq; // (layer, dim, dim)
+    std::vector<float> wk; // (layer, dim, dim)
+    std::vector<float> wv; // (layer, dim, dim)
+    std::vector<float> wo; // (layer, dim, dim)
    // weights for ffn
-    float* w1; // (layer, hidden_dim, dim)
-    float* w2; // (layer, dim, hidden_dim)
-    float* w3; // (layer, hidden_dim, dim)
+    std::vector<float> w1; // (layer, hidden_dim, dim)
+    std::vector<float> w2; // (layer, dim, hidden_dim)
+    std::vector<float> w3; // (layer, hidden_dim, dim)
    // final rmsnorm
-    float* rms_final_weight; // (dim,)
+    std::vector<float> rms_final_weight; // (dim,)
    // freq_cis for RoPE relative positional embeddings
-    // float* freq_cis_real; // (seq_len, dim/2)
-    // float* freq_cis_imag; // (seq_len, dim/2)
+    // std::vector<float> freq_cis_real; // (seq_len, dim/2)
+    // std::vector<float> freq_cis_imag; // (seq_len, dim/2)
    // (optional) classifier weights for the logits, on the last layer
-    float* wcls;
-
-    ~TransformerWeights() {
-        delete[] token_embedding_table;
-        delete[] rms_att_weight;
-        delete[] rms_ffn_weight;
-        delete[] wq;
-        delete[] wk;
-        delete[] wv;
-        delete[] wo;
-        delete[] w1;
-        delete[] w2;
-        delete[] w3;
-        delete[] rms_final_weight;
-        delete[] wcls;
-    }
+    std::vector<float> wcls;
 };

-static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
-    // we calloc instead of malloc to keep valgrind happy
-    w->token_embedding_table = new float[p->vocab_size * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) {
+    const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads;
+    try {
+        w->token_embedding_table.resize(p->vocab_size * p->dim);
+        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);

-    w->rms_att_weight = new float[p->n_layers * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+        w->rms_att_weight.resize(p->n_layers * p->dim);
+        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);

-    w->rms_ffn_weight = new float[p->n_layers * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+        w->rms_ffn_weight.resize(p->n_layers * p->dim);
+        LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);

-    w->wq = new float[p->n_layers * p->dim * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        w->wq.resize(p->n_layers * p->dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

-    w->wk = new float[p->n_layers * p->dim * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

-    w->wv = new float[p->n_layers * p->dim * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries);

-    w->wo = new float[p->n_layers * p->dim * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+        w->wo.resize(p->n_layers * p->dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);

-    w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+        w->w1.resize(p->n_layers * p->hidden_dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

-    w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+        w->w2.resize(p->n_layers * p->hidden_dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);

-    w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
-    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+        w->w3.resize(p->n_layers * p->hidden_dim * p->dim);
+        LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);

-    w->rms_final_weight = new float[p->dim]();
-    printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+        w->rms_final_weight.resize(p->dim);
+        LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);

-    if (shared_weights) {
-        w->wcls = NULL;
-    } else {
-        w->wcls = new float[p->vocab_size * p->dim]();
-        printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+        if (shared_weights) {
+            w->wcls = {};
+        } else {
+            w->wcls.resize(p->vocab_size * p->dim);
+            LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+        }
+    }
+    catch (std::length_error &) {
+        die("Invalid configuration. Failed to allocate memory for weights");
    }
 }

-static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
-    if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
-    if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
-    if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
-    if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
-    if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
-    if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
-    if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
-    if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
+static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) {
+    if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1;
+    if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1;
+    if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1;
+    if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1;
+    if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1;
+    if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1;
+    if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1;
+    if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1;
+    if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1;
+    if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1;
+    if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1;

    // Skip freq_cis_real & freq_cis_imag
    int head_size = p->dim / p->n_heads;
    fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);

-    if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+    if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1;

    // Check we didn't forget to read anything
    auto curr = ftell(f);
    fseek(f, 0, SEEK_END);
    auto end = ftell(f);
    if (curr != end) {
-        printf("Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", curr, end);
+        LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", __func__, curr, end);
        return 1;
    }

@@ -190,20 +181,20 @@ static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bo
 }

 static void print_sample_weights(TransformerWeights *w){
-    printf("----- Quick print of first of the weight vales of all the variables\n");
-    printf("%f\n", w->token_embedding_table[0]);
-    printf("%f\n", w->rms_att_weight[0]);
-    printf("%f\n", w->rms_ffn_weight[0]);
+    LOG("----- Quick print of first of the weight vales of all the variables\n");
+    LOG("%f\n", w->token_embedding_table[0]);
+    LOG("%f\n", w->rms_att_weight[0]);
+    LOG("%f\n", w->rms_ffn_weight[0]);

-    printf("%f\n", w->wq[0]);
-    printf("%f\n", w->wk[0]);
-    printf("%f\n", w->wv[0]);
-    printf("%f\n", w->wo[0]);
-    printf("%f\n", w->w1[0]);
-    printf("%f\n", w->w2[0]);
-    printf("%f\n", w->w3[0]);
-    printf("%f\n", w->rms_att_weight[0]);
-    if (w->wcls) printf("%f\n", w->wcls[0]);
+    LOG("%f\n", w->wq[0]);
+    LOG("%f\n", w->wk[0]);
+    LOG("%f\n", w->wv[0]);
+    LOG("%f\n", w->wo[0]);
+    LOG("%f\n", w->w1[0]);
+    LOG("%f\n", w->w2[0]);
+    LOG("%f\n", w->w3[0]);
+    LOG("%f\n", w->rms_att_weight[0]);
+    if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -225,14 +216,16 @@ struct llama_vocab {
 };

 struct my_llama_hparams {
-    uint32_t n_vocab = 32000;
-    uint32_t n_ctx   = 512;   // this is provided as user input?
-    uint32_t n_embd  = 4096;
-    uint32_t n_ff    = 11008;
-    uint32_t n_mult  = 4;
-    uint32_t n_head  = 32;
-    uint32_t n_layer = 32;
-    uint32_t n_rot   = 64;
+    uint32_t n_vocab   = 32000;
+    uint32_t n_ctx     = 512;   // this is provided as user input?
+    uint32_t n_embd    = 4096;
+    uint32_t n_ff      = 11008;
+    uint32_t n_mult    = 4;
+    uint32_t n_head    = 32;
+    uint32_t n_head_kv = 32;
+    uint32_t n_layer   = 32;
+    uint32_t n_rot     = 64;
+
    bool operator!=(const my_llama_hparams& other) const {
        return memcmp(this, &other, sizeof(my_llama_hparams));
    }
@@ -325,14 +318,30 @@ struct train_params {
 };

 static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
-    printf("%s: n_mult:  %u\n", __func__, params->n_mult);
-    printf("%s: n_head:  %u\n", __func__, params->n_head);
-    printf("%s: n_ff:    %u\n", __func__, params->n_ff);
-    printf("%s: n_layer: %u\n", __func__, params->n_layer);
-    printf("%s: n_rot:   %u\n", __func__, params->n_rot);
+    LOG("%s: n_vocab:   %u\n", __func__, params->n_vocab);
+    LOG("%s: n_ctx:     %u\n", __func__, params->n_ctx);
+    LOG("%s: n_embd:    %u\n", __func__, params->n_embd);
+    LOG("%s: n_mult:    %u\n", __func__, params->n_mult);
+    LOG("%s: n_head:    %u\n", __func__, params->n_head);
+    LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
+    LOG("%s: n_ff:      %u\n", __func__, params->n_ff);
+    LOG("%s: n_layer:   %u\n", __func__, params->n_layer);
+    LOG("%s: n_rot:     %u\n", __func__, params->n_rot);
+}
+
+static void print_tensor_info(const struct ggml_context * ctx) {
+    for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        LOG("%s: Allocating ", __func__);
+        int64_t total = 1;
+        int i = 0;
+        for (; i < ggml_n_dims(t); ++i) {
+            if (i > 0) LOG("x ");
+            LOG("[%" PRId64 "] ", t->ne[i]);
+            total *= t->ne[i];
+        }
+        if (i > 1) LOG("= [%" PRId64 "] ", total);
+        LOG("float space for %s\n", ggml_get_name(t));
+    }
 }

 static void init_model(struct my_llama_model * model) {
@@ -342,6 +351,8 @@ static void init_model(struct my_llama_model * model) {
    const uint32_t n_layer = hparams.n_layer;
    const uint32_t n_vocab = hparams.n_vocab;

+    const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv;
+
    const uint32_t n_ff = hparams.n_ff;
    struct ggml_context * ctx = model->ctx;

@@ -350,25 +361,8 @@ static void init_model(struct my_llama_model * model) {
    model->train_tokens = 0;

    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
-
    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-    printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd);
-
    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
-
-    // printing the per-layer allocations here so we dont print in the for loop.
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
-
-    printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer);
-
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
-    printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);

    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
    ggml_set_name(model->norm,           "norm.weight");
@@ -383,8 +377,8 @@ static void init_model(struct my_llama_model * model) {
        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
-        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
+        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries);
        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);

        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@@ -406,6 +400,8 @@ static void init_model(struct my_llama_model * model) {
        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
    }
+
+    print_tensor_info(ctx);
 }

 static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
@@ -421,9 +417,9 @@ static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
 static void print_row(struct ggml_tensor * probs, int i) {
    for (int k = 0; k < probs->ne[0]; ++k) {
        float p = get_f32_2d(probs, k, i);
-        printf(" %f", p);
+        LOG(" %f", p);
    }
-    printf("\n");
+    LOG("\n");
 }

 static void print_matrix(struct ggml_tensor * probs) {
@@ -431,33 +427,12 @@ static void print_matrix(struct ggml_tensor * probs) {
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = get_f32_2d(probs, k, i);
-            printf(" %.2f", p);
+            LOG(" %.2f", p);
        }
-        printf("\n");
+        LOG("\n");
    }
 }

-#ifdef __GNUC__
-#ifdef __MINGW32__
-__attribute__((format(gnu_printf, 1, 2)))
-#else
-__attribute__((format(printf, 1, 2)))
-#endif
-#endif
-static std::string format(const char * fmt, ...) {
-    va_list ap, ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX);
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
 struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
@@ -549,8 +524,9 @@ static std::string llama_escape_whitespaces(const std::string & text) {
    return out.str();
 }

-static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
+static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) {
    if (is_ggml_file(filename)) {
+        LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename);
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
@@ -578,6 +554,9 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
        const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);

        const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
+        if (n_vocab != static_cast<uint32_t>(config->vocab_size)) {
+            die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size);
+        }

        vocab->id_to_token.resize(n_vocab);

@@ -595,7 +574,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
        gguf_free(ctx);
    } else {
        // assume llama2.c vocabulary
-        printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
+        LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
        llama_file file(filename, "rb");
        if (!file.fp) {
            die_fmt("%s: %s", strerror(errno), filename);
@@ -638,38 +617,15 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
 }

 static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
-    int ct;
-    switch (ggml_n_dims(gg_weights)) {
-        case 1:
-            ct = 0;
-            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
-                float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
-                *ptr = karpathy_weights[ct];
-                ct++;
-            }
-            break;
-        case 2:
-            ct = 0;
-            for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
-                for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
-                    float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
-                    *ptr = karpathy_weights[ct];
-                    ct++;
-                }
-            }
-            break;
-        case 3:
-            ct = 0;
-            for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
-                for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
-                    for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
-                        float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
-                        *ptr = karpathy_weights[ct];
-                        ct++;
-                    }
-                }
-            }
-            break;
+    int size = 1;
+    for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) {
+        size *= gg_weights->ne[dim];
+    }
+    for (int ct = 0; ct < size; ++ct) {
+        int64_t i0 = 0; int64_t i1 = 0;
+        int64_t i2 = 0; int64_t i3 = 0;
+        ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3);
+        ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]);
    }
 }

@@ -679,16 +635,18 @@ static void save_as_llama_model(
    // convert AK weights into GG weights one by one.
    // w->token_embedding_table -> model->tok_embeddings
    // float*                   -> struct ggml_tensor
-    convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
-    convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
+    convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data());
+    convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data());

-    convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
+    convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data());
    //print_row(model->norm, 0);

    // for rms-att-weight
    int row_length = model->hparams.n_embd;
    int n_ff = model->hparams.n_ff;

+    const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv;
+
    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
        auto & layer = model->layers[i];
        // 1d
@@ -697,9 +655,10 @@ static void save_as_llama_model(

        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
        convert_weights_ak_to_gg(layer.wq            , &w->wq[i*row_length*row_length]);
-        convert_weights_ak_to_gg(layer.wk            , &w->wk[i*row_length*row_length]);
-        convert_weights_ak_to_gg(layer.wv            , &w->wv[i*row_length*row_length]);
        convert_weights_ak_to_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+        // from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries
+        convert_weights_ak_to_gg(layer.wk            , &w->wk[i*row_length*row_length/n_multiqueries]);
+        convert_weights_ak_to_gg(layer.wv            , &w->wv[i*row_length*row_length/n_multiqueries]);

        convert_weights_ak_to_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
        convert_weights_ak_to_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
@@ -736,8 +695,8 @@ static void save_as_llama_model(
    gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
    gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
-    // n_head_kv is optional, default to n_head
-    // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
+    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
+    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv);
    gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
    gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
    gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
@@ -789,12 +748,12 @@ static void save_as_llama_model(

 static struct train_params get_default_train_params() {
    struct train_params params;
-    params.fn_vocab_model    = "models/7B/ggml-model-f16.gguf";
+    params.fn_vocab_model          = "models/7B/ggml-model-f16.gguf";
    params.fn_llama2c_output_model = "ak_llama_model.bin";
-    params.fn_train_data     = "shakespeare.txt";
-    params.fn_checkpoint_in  = "checkpoint.bin";
-    params.fn_checkpoint_out = "checkpoint.bin";
-    params.fn_model_out      = "ggml-checkpoint-f32.bin";
+    params.fn_train_data           = "shakespeare.txt";
+    params.fn_checkpoint_in        = "checkpoint.bin";
+    params.fn_checkpoint_out       = "checkpoint.bin";
+    params.fn_model_out            = "ggml-checkpoint-f32.bin";

    params.seed       =   -1;

@@ -829,8 +788,8 @@ static struct train_params get_default_train_params() {
    params.adam_alpha        = 1e-3f;
    params.adam_decay        = 1e-3f;

-    params.mem_model_gb   = 2;
-    params.mem_compute_gb = 24;
+    params.mem_model_gb    = 2;
+    params.mem_compute_gb  = 24;
    params.mem_compute0_gb = 8;
    params.mem_compute1_gb = 2;

@@ -916,19 +875,30 @@ int main(int argc, char ** argv) {
    if (!params_parse(argc, argv, &params)) {
        return 1;
    }
+    log_set_target(stdout);
    Config config;
    TransformerWeights weights = {};
    {
-        FILE *file = fopen(params.fn_llama2c_model, "rb");
-        if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
+        LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model);
+        // the checkpoint is raw binary data read with fread(): keep binary mode,
+        // text mode would translate/truncate the data on Windows
+        FILE *file = fopen(params.fn_llama2c_model, "rb");
+        if (!file) {
+            LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model);
+            return 1;
+        }
        // read in the config header
-        if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
+        if (fread(&config, sizeof(Config), 1, file) != 1) {
+            LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model);
+            fclose(file);
+            return 1;
+        }
        auto shared_weights = config.vocab_size > 0;
        config.vocab_size = abs(config.vocab_size);

        // read in the Transformer weights
-        malloc_weights(&weights, &config, shared_weights);
-        if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
+        alloc_weights(&weights, &config, shared_weights);
+        if (checkpoint_init_weights(&weights, &config, file, shared_weights)) {
+            LOG("%s: Unable to initialize transformer weights from %s!\n",__func__,params.fn_llama2c_model);
+            fclose(file);
+            return 1;
+        }
        fclose(file);
    }

@@ -936,15 +906,18 @@ int main(int argc, char ** argv) {
    load_vocab(params.fn_vocab_model, &config, &vocab);

    struct my_llama_model model;
-    model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
-    model.hparams.n_ctx   = params.n_ctx;
-    model.hparams.n_embd  = config.dim; //params.n_embd;
-    model.hparams.n_ff    = config.hidden_dim;
-    model.hparams.n_mult  = 32;//params.n_mult;
-    model.hparams.n_head  = config.n_heads; //params.n_head;
-    model.hparams.n_layer = config.n_layers; //params.n_layer;
-    model.hparams.n_rot   = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
+    model.hparams.n_vocab   = config.vocab_size; //llama_n_vocab(lctx);
+    model.hparams.n_ctx     = params.n_ctx;
+    model.hparams.n_embd    = config.dim; //params.n_embd;
+    model.hparams.n_ff      = config.hidden_dim;
+    model.hparams.n_mult    = 32;//params.n_mult;
+    model.hparams.n_head    = config.n_heads; //params.n_head;
+    model.hparams.n_head_kv = config.n_kv_heads;
+    model.hparams.n_layer   = config.n_layers; //params.n_layer;
+    model.hparams.n_rot     = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
+
    print_params(&model.hparams);
+
    struct ggml_init_params lcparams;
    lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
    lcparams.mem_buffer = NULL;
@@ -956,7 +929,7 @@ int main(int argc, char ** argv) {
    model.name = basename(params.fn_llama2c_model);
    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);

-    printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
+    LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model);

    ggml_free(model.ctx);
    return 0;
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -61,6 +61,8 @@ int main(int argc, char ** argv) {
    }

    params.embedding = true;
+    // For non-causal models, batch size must be equal to ubatch size
+    params.n_ubatch = params.n_batch;

    print_build_info();

@@ -114,7 +116,9 @@ int main(int argc, char ** argv) {
    for (const auto & prompt : prompts) {
        auto inp = ::llama_tokenize(ctx, prompt, true, false);
        if (inp.size() > n_batch) {
-            inp.resize(n_batch);
+            fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+                    __func__, (long long int) inp.size(), (long long int) n_batch);
+            return 1;
        }
        inputs.push_back(inp);
    }
--- a/examples/gguf-split/CMakeLists.txt
+++ b/examples/gguf-split/CMakeLists.txt
@@ -0,0 +1,5 @@
+# Build and install the gguf-split executable from gguf-split.cpp.
+set(TARGET gguf-split)
+add_executable(${TARGET} gguf-split.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+# links against the project's common and llama libraries plus the thread library
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/gguf-split/README.md
+++ b/examples/gguf-split/README.md
@@ -0,0 +1,9 @@
+## GGUF split Example
+
+CLI to split / merge GGUF files.
+
+**Command line options:**
+
+- `--split`: split GGUF to multiple GGUF, default operation.
+- `--split-max-tensors`: maximum tensors in each split: default(128)
+- `--merge`: merge multiple GGUF to a single GGUF.
--- a/examples/gguf-split/gguf-split.cpp
+++ b/examples/gguf-split/gguf-split.cpp
@@ -0,0 +1,468 @@
+#include "llama.h"
+#include "common.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include <stdio.h>
+#include <string.h>
+#include <climits>
+#include <stdexcept>
+
+#if defined(_WIN32)
+    #include <windows.h>
+    #ifndef PATH_MAX
+        #define PATH_MAX MAX_PATH
+    #endif
+    #include <io.h>
+#endif
+
+// Operation requested on the command line (--split / --merge).
+enum split_operation : uint8_t {
+    SPLIT_OP_SPLIT, // split one GGUF into multiple GGUF files
+    SPLIT_OP_MERGE, // merge multiple GGUF files into a single GGUF
+};
+
+// Parsed command line parameters.
+struct split_params {
+    split_operation operation = SPLIT_OP_SPLIT; // splitting is the default operation
+    int n_split_tensors = 128;                  // --split-max-tensors: max tensors in each split
+    std::string input;                          // GGUF_IN positional argument
+    std::string output;                         // GGUF_OUT positional argument
+};
+
+// Print the command line help for `executable` to stdout.
+// The default for --split-max-tensors is taken from a default-constructed split_params.
+static void split_print_usage(const char * executable) {
+    const split_params defaults;
+
+    // header
+    fputs("\n", stdout);
+    printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable);
+    fputs("\n", stdout);
+    fputs("Apply a GGUF operation on IN to OUT.", stdout);
+    fputs("\n", stdout);
+
+    // option list
+    fputs("options:\n", stdout);
+    fputs("  -h, --help            show this help message and exit\n", stdout);
+    fputs("  --version             show version and build info\n", stdout);
+    fputs("  --split               split GGUF to multiple GGUF (default)\n", stdout);
+    printf("  --split-max-tensors   max tensors in each split: default(%d)\n", defaults.n_split_tensors);
+    fputs("  --merge               merge multiple GGUF to a single GGUF\n", stdout);
+    fputs("\n", stdout);
+}
+
+// Parse the command line into `params`.
+//
+// Leading options are consumed first: every argument starting with "--", plus the
+// short form "-h". Underscores in an option name are normalized to dashes. The
+// two arguments that follow the options are GGUF_IN and GGUF_OUT.
+//
+// Returns false, after printing "bad arguments" and the usage, when fewer than
+// two positional arguments remain.
+// Throws std::invalid_argument on an unknown option or an invalid option value.
+// --help / -h and --version print their output and terminate with exit(0).
+static bool split_params_parse_ex(int argc, const char ** argv, split_params & params) {
+    std::string arg;
+    const std::string arg_prefix = "--";
+    bool invalid_param = false;
+
+    int arg_idx = 1;
+    for (; arg_idx < argc; arg_idx++) {
+        // options end at the first positional argument; "-h" is the only short
+        // option and must be accepted here, otherwise it can never be matched below
+        if (strncmp(argv[arg_idx], "--", 2) != 0 && strcmp(argv[arg_idx], "-h") != 0) {
+            break;
+        }
+
+        arg = argv[arg_idx];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        bool arg_found = false;
+        if (arg == "-h" || arg == "--help") {
+            split_print_usage(argv[0]);
+            exit(0);
+        }
+        if (arg == "--version") {
+            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+            exit(0);
+        }
+
+        if (arg == "--merge") {
+            arg_found = true;
+            params.operation = SPLIT_OP_MERGE;
+        }
+        if (arg == "--split") {
+            arg_found = true;
+            params.operation = SPLIT_OP_SPLIT;
+        }
+        if (arg == "--split-max-tensors") {
+            if (++arg_idx >= argc) {
+                invalid_param = true;
+                break;
+            }
+            arg_found = true;
+            params.n_split_tensors = atoi(argv[arg_idx]);
+            if (params.n_split_tensors <= 0) {
+                // the value is later used as a divisor, so zero, negative or
+                // non-numeric input (atoi returns 0) must be rejected here
+                invalid_param = true;
+                break;
+            }
+        }
+
+        if (!arg_found) {
+            throw std::invalid_argument("error: unknown argument: " + arg);
+        }
+    }
+
+    if (invalid_param) {
+        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+    }
+
+    if (argc - arg_idx < 2) {
+        printf("%s: bad arguments\n", argv[0]);
+        split_print_usage(argv[0]);
+        return false;
+    }
+
+    params.input = argv[arg_idx++];
+    params.output = argv[arg_idx++];
+
+    return true;
+}
+
+// Parse the command line into `params`, terminating the process on any error.
+// Returns true when parsing succeeded; every failure path calls exit(EXIT_FAILURE).
+static bool split_params_parse(int argc, const char ** argv, split_params & params) {
+    try {
+        if (!split_params_parse_ex(argc, argv, params)) {
+            // split_params_parse_ex() already printed "bad arguments" and the
+            // usage, do not print the usage a second time
+            exit(EXIT_FAILURE);
+        }
+    }
+    catch (const std::invalid_argument & ex) {
+        fprintf(stderr, "%s\n", ex.what());
+        split_print_usage(argv[0]);
+        exit(EXIT_FAILURE);
+    }
+    return true;
+}
+
+// Append n zero bytes to file.
+static void zeros(std::ofstream & file, size_t n) {
+    // write from a zero-filled buffer in chunks instead of one write() per byte
+    static const char zero_buf[4096] = {0};
+    while (n > 0) {
+        const size_t n_chunk = std::min(n, sizeof(zero_buf));
+        file.write(zero_buf, (std::streamsize) n_chunk);
+        n -= n_chunk;
+    }
+}
+
+// State and steps of the split operation: copies the tensors of the input GGUF,
+// in order, into consecutive output files holding at most
+// params.n_split_tensors tensors each.
+//
+// Usage (see gguf_split): split_start(), then next_tensor() per tensor, calling
+// split_end() + split_start() whenever should_split() is true, and a final split_end().
+struct split_strategy {
+    const split_params params;             // copy of the command line parameters
+    std::ifstream & f_input;               // input file the tensor data is read from
+    struct gguf_context * ctx_gguf;        // gguf context of the input file
+    struct ggml_context * ctx_meta = NULL; // ggml context holding the input tensor descriptors
+    const int n_tensors;                   // number of tensors in the input
+
+    const int n_split;                     // number of output files
+    int i_split = 0;                       // index of the next split to start
+
+    int i_tensor = 0;                      // index of the next tensor to copy
+
+    std::vector<uint8_t> read_data;        // scratch buffer for tensor data, only ever grown
+
+    struct gguf_context * ctx_out = NULL;  // gguf context of the split being written
+    std::ofstream fout;                    // output file of the split being written
+
+    split_strategy(const split_params & params,
+            std::ifstream & f_input,
+            struct gguf_context * ctx_gguf,
+            struct ggml_context * ctx_meta) :
+        params(params),
+        f_input(f_input),
+        ctx_gguf(ctx_gguf),
+        ctx_meta(ctx_meta),
+        n_tensors(gguf_get_n_tensors(ctx_gguf)),
+        n_split(std::ceil(1. * n_tensors / params.n_split_tensors)) {
+        }
+
+    // True when the current split is full and more tensors remain.
+    bool should_split() const {
+        return i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
+    }
+
+    // Open the next split file and reserve room for its metadata.
+    void split_start() {
+        ctx_out = gguf_init_empty();
+
+        // Save all metadata in first split only
+        if (i_split == 0) {
+            gguf_set_kv(ctx_out, ctx_gguf);
+        }
+        gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split);
+        gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, n_split);
+        gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors);
+
+        // populate the original tensors, so we get an initial metadata
+        for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) {
+            struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+            gguf_add_tensor(ctx_out, meta);
+        }
+
+        char split_path[PATH_MAX] = {0};
+        llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split);
+
+        fprintf(stderr, "%s: %s ...", __func__, split_path);
+        fout = std::ofstream(split_path, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+
+        auto meta_size = gguf_get_meta_size(ctx_out);
+
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+
+        i_split++;
+    }
+
+    // Copy the data of tensor i_tensor from the input into the current split.
+    void next_tensor() {
+        const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
+        struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
+        auto n_bytes = ggml_nbytes(t);
+
+        if (read_data.size() < n_bytes) {
+            read_data.resize(n_bytes);
+        }
+
+        auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
+        f_input.seekg(offset);
+        f_input.read((char *)read_data.data(), n_bytes);
+        if (!f_input) {
+            // a failed or short read would otherwise silently write stale buffer
+            // contents into the split
+            fprintf(stderr, "\n%s: failed to read tensor %s from input GGUF\n", __func__, t_name);
+            exit(EXIT_FAILURE);
+        }
+
+        t->data = read_data.data();
+
+        // write tensor data + padding
+        fout.write((const char *)t->data, n_bytes);
+        zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
+
+        i_tensor++;
+    }
+
+    // Finalize the current split: overwrite the placeholder with the real metadata.
+    void split_end() {
+        // go back to beginning of file and write the updated metadata
+        fout.seekp(0);
+        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+        gguf_get_meta_data(ctx_out, data.data());
+        fout.write((const char *)data.data(), data.size());
+
+        fout.close();
+        gguf_free(ctx_out);
+        ctx_out = NULL; // do not keep a dangling pointer between splits
+
+        fprintf(stderr, "\033[3Ddone\n");
+    }
+};
+
+// Split the GGUF file split_params.input into multiple files derived from
+// split_params.output, each holding at most split_params.n_split_tensors tensors.
+// Exits the process with EXIT_FAILURE when the input cannot be opened or loaded.
+static void gguf_split(const split_params & split_params) {
+    struct ggml_context * ctx_meta = NULL;
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ &ctx_meta,
+    };
+
+    std::ifstream f_input(split_params.input.c_str(), std::ios::binary);
+    if (!f_input.is_open()) {
+        fprintf(stderr, "%s:  failed to open input GGUF from %s\n", __func__, split_params.input.c_str());
+        exit(EXIT_FAILURE);
+    }
+
+    auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params);
+    if (!ctx_gguf) {
+        fprintf(stderr, "%s:  failed to load input GGUF from %s\n", __func__, split_params.input.c_str());
+        exit(EXIT_FAILURE);
+    }
+
+    split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta);
+
+    char first_split_path[PATH_MAX] = {0};
+    llama_split_path(first_split_path, sizeof(first_split_path),
+                     split_params.output.c_str(), strategy.i_split, strategy.n_split);
+    fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n",
+            __func__, split_params.input.c_str(),
+            first_split_path,
+            split_params.n_split_tensors);
+
+    strategy.split_start();
+
+    while (strategy.i_tensor < strategy.n_tensors) {
+        strategy.next_tensor();
+        if (strategy.should_split()) {
+            strategy.split_end();
+            strategy.split_start();
+        }
+    }
+    strategy.split_end();
+
+    // release the input contexts: ctx_meta was created by gguf_init_from_file()
+    // through params.ctx and has to be freed separately from ctx_gguf
+    gguf_free(ctx_gguf);
+    ggml_free(ctx_meta);
+    f_input.close();
+
+    fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n",
+            __func__, strategy.n_split, strategy.n_tensors);
+}
+
+// Merge the GGUF splits starting at split_params.input (the first split) into
+// the single file split_params.output.
+//
+// First pass: load the metadata of every split; the split count and the KV
+// metadata are taken from the first split. Second pass: copy the tensor data of
+// every split into the output, then write the merged metadata at the start of the file.
+// Exits the process with EXIT_FAILURE on any error.
+static void gguf_merge(const split_params & split_params) {
+    fprintf(stderr, "%s: %s -> %s\n",
+            __func__, split_params.input.c_str(),
+            split_params.output.c_str());
+    int n_split = 1; // updated from the metadata of the first split
+    int total_tensors = 0;
+
+    auto * ctx_out = gguf_init_empty();
+    std::ofstream fout(split_params.output.c_str(), std::ios::binary);
+    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+
+    std::vector<uint8_t> read_data;
+    std::vector<ggml_context *> ctx_metas;
+    std::vector<gguf_context *> ctx_ggufs;
+
+    char split_path[PATH_MAX] = {0};
+    strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1);
+    char split_prefix[PATH_MAX] = {0};
+
+    // First pass to find KV and tensors metadata
+    for (int i_split = 0; i_split < n_split; i_split++) {
+        struct ggml_context * ctx_meta = NULL;
+
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx_meta,
+        };
+
+        if (i_split > 0) {
+            llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
+        }
+        fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path);
+
+        auto * ctx_gguf = gguf_init_from_file(split_path, params);
+        if (!ctx_gguf) {
+            // report the split that actually failed, not always the first input file
+            fprintf(stderr, "\n%s:  failed to load input GGUF from %s\n", __func__, split_path);
+            exit(EXIT_FAILURE);
+        }
+        ctx_ggufs.push_back(ctx_gguf);
+        ctx_metas.push_back(ctx_meta);
+
+        if (i_split == 0) {
+            auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+            if (key_n_split < 0) {
+                fprintf(stderr,
+                        "\n%s: input file does not contain %s metadata\n",
+                        __func__,
+                        LLM_KV_SPLIT_COUNT);
+                gguf_free(ctx_gguf);
+                ggml_free(ctx_meta);
+                gguf_free(ctx_out);
+                fout.close();
+                exit(EXIT_FAILURE);
+            }
+
+            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+            if (n_split < 1) {
+                fprintf(stderr,
+                        "\n%s: input file does not contain a valid split count %d\n",
+                        __func__,
+                        n_split);
+                gguf_free(ctx_gguf);
+                ggml_free(ctx_meta);
+                gguf_free(ctx_out);
+                fout.close();
+                exit(EXIT_FAILURE);
+            }
+
+            // Verify the file naming and extract split_prefix
+            if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) {
+                fprintf(stderr, "\n%s: unexpected input file name: %s"
+                                " i_split=%d"
+                                " n_split=%d\n", __func__,
+                        split_path, i_split, n_split);
+                gguf_free(ctx_gguf);
+                ggml_free(ctx_meta);
+                gguf_free(ctx_out);
+                fout.close();
+                exit(EXIT_FAILURE);
+            }
+
+            // Do not trigger merge if we try to merge again the output
+            gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0);
+
+            // Set metadata from the first split
+            gguf_set_kv(ctx_out, ctx_gguf);
+        }
+
+        auto n_tensors = gguf_get_n_tensors(ctx_gguf);
+        for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
+            const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
+            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
+            gguf_add_tensor(ctx_out, t);
+        }
+        total_tensors += n_tensors;
+
+        fprintf(stderr, "\033[3Ddone\n");
+    }
+
+    // placeholder for the meta data
+    {
+        auto meta_size = gguf_get_meta_size(ctx_out);
+        ::zeros(fout, meta_size);
+    }
+
+    // Write tensors data
+    for (int i_split = 0; i_split < n_split; i_split++) {
+        llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split);
+        std::ifstream f_input(split_path, std::ios::binary);
+        if (!f_input.is_open()) {
+            fprintf(stderr, "%s:  failed to open input GGUF from %s\n", __func__, split_path);
+            for (uint32_t i = 0; i < ctx_ggufs.size(); i++) {
+                gguf_free(ctx_ggufs[i]);
+                ggml_free(ctx_metas[i]);
+            }
+            gguf_free(ctx_out);
+            fout.close();
+            exit(EXIT_FAILURE);
+        }
+        fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
+
+        auto * ctx_gguf = ctx_ggufs[i_split];
+        auto * ctx_meta = ctx_metas[i_split];
+
+        auto n_tensors = gguf_get_n_tensors(ctx_gguf);
+        for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) {
+            const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor);
+            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name);
+
+            auto n_bytes = ggml_nbytes(t);
+
+            if (read_data.size() < n_bytes) {
+                read_data.resize(n_bytes);
+            }
+
+            auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
+            f_input.seekg(offset);
+            f_input.read((char *)read_data.data(), n_bytes);
+            if (!f_input) {
+                // a failed or short read would otherwise silently copy stale
+                // buffer contents into the merged file
+                fprintf(stderr, "\n%s: failed to read tensor %s from %s\n", __func__, t_name, split_path);
+                exit(EXIT_FAILURE);
+            }
+
+            // write tensor data + padding
+            fout.write((const char *)read_data.data(), n_bytes);
+            zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
+        }
+
+        gguf_free(ctx_gguf);
+        ggml_free(ctx_meta);
+        f_input.close();
+        fprintf(stderr, "\033[3Ddone\n");
+    }
+
+    {
+        // go back to beginning of file and write the updated metadata
+        fout.seekp(0);
+        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+        gguf_get_meta_data(ctx_out, data.data());
+        fout.write((const char *)data.data(), data.size());
+
+        fout.close();
+        gguf_free(ctx_out);
+    }
+
+    fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n",
+            __func__, split_params.output.c_str(), n_split, total_tensors);
+}
+
+// Entry point: parse the command line and run the selected GGUF operation.
+int main(int argc, const char ** argv) {
+    // no separate argc check: split_params_parse() reports missing arguments and
+    // exits by itself, and printing the usage here as well would duplicate it
+    // (and would print it in front of the --version output)
+    split_params params;
+    split_params_parse(argc, argv, params);
+
+    switch (params.operation) {
+        case SPLIT_OP_SPLIT: gguf_split(params);
+            break;
+        case SPLIT_OP_MERGE: gguf_merge(params);
+            break;
+        default: split_print_usage(argv[0]);
+            exit(EXIT_FAILURE);
+    }
+
+    return 0;
+}
--- a/examples/gritlm/README.md
+++ b/examples/gritlm/README.md
@@ -0,0 +1,62 @@
+## Generative Representational Instruction Tuning (GRIT) Example
+[gritlm] is a model which can generate embeddings as well as "normal" text,
+depending on the instructions in the prompt.
+
+* Paper: https://arxiv.org/pdf/2402.09906.pdf
+
+### Retrieval-Augmented Generation (RAG) use case
+One use case for `gritlm` is to use it with RAG. Recall how RAG works: we take
+the documents that we want to use as context, to ground the large language
+model (LLM), and we create token embeddings for them. We then store these
+token embeddings in a vector database.
+
+When we perform a query, i.e. prompt the LLM, we first create token embeddings
+for the query and then search the vector database to retrieve the most
+similar vectors, and return those documents so they can be passed to the LLM as
+context. Then the query and the context are passed to the LLM, which would
+have to _again_ create token embeddings for the query. But because gritlm is used,
+the result of the first query can be cached, and the second generation of token
+embeddings for the query does not have to be performed at all.
+
+### Running the example
+Download a Grit model:
+```console
+$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
+```
+
+Run the example using the downloaded model:
+```console
+$ ./gritlm -m gritlm-7b_q4_1.gguf
+
+Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
+Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
+Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
+Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
+
+Oh, brave adventurer, who dared to climb
+The lofty peak of Mt. Fuji in the night,
+When shadows lurk and ghosts do roam,
+And darkness reigns, a fearsome sight.
+
+Thou didst set out, with heart aglow,
+To conquer this mountain, so high,
+And reach the summit, where the stars do glow,
+And the moon shines bright, up in the sky.
+
+Through the mist and fog, thou didst press on,
+With steadfast courage, and a steadfast will,
+Through the darkness, thou didst not be gone,
+But didst climb on, with a steadfast skill.
+
+At last, thou didst reach the summit's crest,
+And gazed upon the world below,
+And saw the beauty of the night's best,
+And felt the peace, that only nature knows.
+
+Oh, brave adventurer, who dared to climb
+The lofty peak of Mt. Fuji in the night,
+Thou art a hero, in the eyes of all,
+For thou didst conquer this mountain, so bright.
+```
+
+[gritlm]: https://github.com/ContextualAI/gritlm
--- a/examples/imatrix/README.md
+++ b/examples/imatrix/README.md
@@ -22,7 +22,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
 ## Example

 ```bash
-LLAMA_CUBLAS=1 make -j
+LLAMA_CUDA=1 make -j

 # generate importance matrix (imatrix.dat)
 ./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -50,11 +50,31 @@ private:
    void keep_imatrix(int ncall) const;
 };

+// strip the '#'-delimited prefix and suffix from a tensor name, if present
+// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
+// a name without '#' is returned unchanged; with a single '#', everything after it is returned
+static std::string filter_tensor_name(const char * name) {
+    const std::string full(name);
+
+    const size_t first_hash = full.find('#');
+    if (first_hash == std::string::npos) {
+        return full;
+    }
+
+    const size_t begin = first_hash + 1;
+    const size_t second_hash = full.find('#', begin);
+    if (second_hash == std::string::npos) {
+        return full.substr(begin);
+    }
+
+    return full.substr(begin, second_hash - begin);
+}
+
 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
    GGML_UNUSED(user_data);

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];
+    std::string wname = filter_tensor_name(src0->name);

    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
@@ -62,7 +82,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
        if (t->op != GGML_OP_MUL_MAT) return false;
        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
-        if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
+        if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
        return true;
    }

@@ -94,12 +114,13 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
        // this is necessary to guarantee equal number of "ncall" for each tensor
        for (int ex = 0; ex < n_as; ++ex) {
            src0 = t->src[2 + ex];
-            auto& e = m_stats[src0->name];
+            wname = filter_tensor_name(src0->name);
+            auto& e = m_stats[wname];
            if (e.values.empty()) {
                e.values.resize(src1->ne[0], 0);
            }
            else if (e.values.size() != (size_t)src1->ne[0]) {
-                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
                exit(1); //GGML_ASSERT(false);
            }
            // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
@@ -107,7 +128,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
            ++e.ncall;
            if (m_params.verbosity > 1) {
-                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
            }
            for (int row = 0; row < (int)src1->ne[1]; ++row) {
                const int excur = m_ids[row*n_as + idx];
@@ -129,17 +150,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            }
        }
    } else {
-        auto& e = m_stats[src0->name];
+        auto& e = m_stats[wname];
        if (e.values.empty()) {
            e.values.resize(src1->ne[0], 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]) {
-            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
            exit(1); //GGML_ASSERT(false);
        }
        ++e.ncall;
        if (m_params.verbosity > 1) {
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
        }
        for (int row = 0; row < (int)src1->ne[1]; ++row) {
            const float * x = data + row * src1->ne[0];
@@ -403,6 +424,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
            }

+            // TODO: use batch.logits to save computations instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return false;
--- a/examples/json-schema-pydantic-example.py
+++ b/examples/json-schema-pydantic-example.py
@@ -0,0 +1,74 @@
+# Usage:
+#! ./server -m some-model.gguf &
+#! pip install pydantic
+#! python json-schema-pydantic-example.py
+
+from pydantic import BaseModel, TypeAdapter
+from annotated_types import MinLen
+from typing import Annotated, List, Optional
+import json, requests
+
+if True:
+
+    def create_completion(*, response_model=None, endpoint="http://localhost:8080/v1/chat/completions", messages, **kwargs):
+        '''
+        Creates a chat completion using an OpenAI-compatible endpoint w/ JSON schema support
+        (llama.cpp server, llama-cpp-python, Anyscale / Together...)
+
+        The response_model param takes a type (+ supports Pydantic) and behaves just as w/ Instructor (see below)
+        '''
+        type_adapter = response_format = None  # defaults, so both names exist when no response_model is given
+        if response_model:
+            type_adapter = TypeAdapter(response_model)
+            schema = type_adapter.json_schema()
+            messages = [{
+                "role": "system",
+                "content": f"You respond in JSON format with the following schema: {json.dumps(schema, indent=2)}"
+            }] + messages
+            response_format={"type": "json_object", "schema": schema}
+
+        data = requests.post(endpoint, headers={"Content-Type": "application/json"},
+                             json=dict(messages=messages, response_format=response_format, **kwargs)).json()
+        if 'error' in data:
+            raise Exception(data['error']['message'])
+        content = data["choices"][0]["message"]["content"]
+        return type_adapter.validate_json(content) if type_adapter else content
+
+else:
+
+    # This alternative branch uses Instructor + OpenAI client lib.
+    # Instructor supports streamed iterable responses, retry & more.
+    # (see https://python.useinstructor.com/)
+    #! pip install instructor openai
+    import instructor, openai
+    client = instructor.patch(
+        openai.OpenAI(api_key="123", base_url="http://localhost:8080"),
+        mode=instructor.Mode.JSON_SCHEMA)
+    create_completion = client.chat.completions.create
+
+
+if __name__ == '__main__':
+
+    class QAPair(BaseModel):
+        question: str
+        concise_answer: str
+        justification: str
+
+    class PyramidalSummary(BaseModel):
+        title: str
+        summary: str
+        question_answers: Annotated[List[QAPair], MinLen(2)]
+        sub_sections: Optional[Annotated[List['PyramidalSummary'], MinLen(2)]]
+
+    print("# Summary\n", create_completion(
+        model="...",
+        response_model=PyramidalSummary,
+        messages=[{
+            "role": "user",
+            "content": f"""
+                You are a highly efficient corporate document summarizer.
+                Create a pyramidal summary of an imaginary internal document about our company processes
+                (starting high-level, going down to each sub sections).
+                Keep questions short, and answers even shorter (trivia / quizz style).
+            """
+        }]))
--- a/examples/json-schema-to-grammar.py
+++ b/examples/json-schema-to-grammar.py
@@ -1,8 +1,10 @@
 #!/usr/bin/env python3
 import argparse
+import itertools
 import json
 import re
 import sys
+from typing import Any, Dict, List, Set, Tuple, Union

 # whitespace is constrained to a single space char to prevent model "running away" in
 # whitespace. Also maybe improves generation quality?
@@ -12,26 +14,54 @@ PRIMITIVE_RULES = {
    'boolean': '("true" | "false") space',
    'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
    'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
+    'value'  : 'object | array | string | number | boolean',
+    'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
+    'array'  : '"[" space ( value ("," space value)* )? "]" space',
+    'uuid'   : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space',
    'string': r''' "\"" (
        [^"\\] |
        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
-      )* "\"" space ''',
+      )* "\"" space''',
    'null': '"null" space',
 }
+OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value']
+
+# TODO: support "uri", "email" string formats
+DATE_RULES = {
+    'date'   : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
+    'time'   : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
+    'date-time': 'date "T" time',
+    'date-string': '"\\"" date "\\"" space',
+    'time-string': '"\\"" time "\\"" space',
+    'date-time-string': '"\\"" date-time "\\"" space',
+}
+
+RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()])

 INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
 GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
-GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'}
+GRAMMAR_RANGE_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\]\-\\]')
+GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]'}

+NON_LITERAL_SET = set('|.()[]{}*+?')
+ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?')
+
+DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])'
+TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits

 class SchemaConverter:
-    def __init__(self, prop_order):
+    def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern):
        self._prop_order = prop_order
+        self._allow_fetch = allow_fetch
+        self._dotall = dotall
+        self._raw_pattern = raw_pattern
        self._rules = {'space': SPACE_RULE}
+        self._refs = {}
+        self._refs_being_resolved = set()

    def _format_literal(self, literal):
        escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
-            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
+            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal
        )
        return f'"{escaped}"'

@@ -41,78 +71,420 @@ class SchemaConverter:
            key = esc_name
        else:
            i = 0
-            while f'{esc_name}{i}' in self._rules:
+            while f'{esc_name}{i}' in self._rules and self._rules[f'{esc_name}{i}'] != rule:
                i += 1
            key = f'{esc_name}{i}'
        self._rules[key] = rule
        return key

+    def resolve_refs(self, schema: dict, url: str):
+        '''
+            Resolves all $ref fields in the given schema, fetching any remote schemas,
+            replacing $ref with absolute reference URL and populating self._refs with the
+            respective referenced (sub)schema dictionaries.
+        '''
+        def visit(n: dict):
+            if isinstance(n, list):
+                return [visit(x) for x in n]
+            elif isinstance(n, dict):
+                ref = n.get('$ref')
+                if ref is not None and ref not in self._refs:
+                    if ref.startswith('https://'):
+                        assert self._allow_fetch, 'Fetching remote schemas is not allowed (use --allow-fetch for force)'
+                        import requests
+
+                        frag_split = ref.split('#')  # text before the first '#' is the document URL
+                        base_url = frag_split[0]
+
+                        target = self._refs.get(base_url)  # reuse a document that was already fetched
+                        if target is None:
+                            target = self.resolve_refs(requests.get(ref).json(), base_url)
+                            self._refs[base_url] = target
+
+                        if len(frag_split) == 1 or frag_split[-1] == '':  # no fragment: the ref is the whole document
+                            return target
+                    elif ref.startswith('#/'):
+                        target = schema
+                        ref = f'{url}{ref}'  # make the local pointer absolute by prefixing this schema's url
+                        n['$ref'] = ref
+                    else:
+                        raise ValueError(f'Unsupported ref {ref}')
+
+                    for sel in ref.split('#')[-1].split('/')[1:]:  # walk the '/'-separated path after the last '#'
+                        assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
+                        target = target[sel]
+
+                    self._refs[ref] = target
+                else:
+                    for v in n.values():
+                        visit(v)
+
+            return n
+        return visit(schema)
+
+    def _generate_union_rule(self, name, alt_schemas):
+        return ' | '.join((
+            self.visit(alt_schema, f'{name}{"-" if name else "alternative-"}{i}')
+            for i, alt_schema in enumerate(alt_schemas)
+        ))
+
+    def _visit_pattern(self, pattern, name):
+        '''
+            Transforms a regular expression pattern into a GBNF rule.
+
+            Input: https://json-schema.org/understanding-json-schema/reference/regular_expressions
+            Output: https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md
+
+            Unsupported features: negative/positive lookaheads, greedy/non-greedy modifiers.
+
+            Mostly a 1:1 translation, except for {x} / {x,} / {x,y} quantifiers for which
+            we define sub-rules to keep the output lean.
+        '''
+
+        assert pattern.startswith('^') and pattern.endswith('$'), 'Pattern must start with "^" and end with "$"'
+        pattern = pattern[1:-1]
+        sub_rule_ids = {}
+
+        i = 0
+        length = len(pattern)
+
+        def to_rule(s: Tuple[str, bool]) -> str:
+            (txt, is_literal) = s
+            return "\"" + txt + "\"" if is_literal else txt
+
+        def transform() -> Tuple[str, bool]:
+            '''
+                Parse a unit at index i (advancing it), and return its string representation + whether it's a literal.
+            '''
+            nonlocal i
+            nonlocal pattern
+            nonlocal sub_rule_ids
+
+            start = i
+            # For each component of this sequence, store its string representation and whether it's a literal.
+            # We only need a flat structure here to apply repetition operators to the last item, and
+            # to merge literals at the end (we're parsing grouped ( sequences ) recursively and don't treat '|' specially:
+            # GBNF's syntax is luckily very close to regular expressions!)
+            seq: list[Tuple[str, bool]] = []
+
+            def get_dot():
+                if self._dotall:
+                    rule = '[\\U00000000-\\U0010FFFF]'
+                else:
+                    # Accept any character... except \n and \r line break chars (\x0A and \x0D)
+                    rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]'
+                return self._add_rule(f'dot', rule)
+
+            def join_seq():
+                nonlocal seq
+                ret = []
+                for is_literal, g in itertools.groupby(seq, lambda x: x[1]):
+                    if is_literal:
+                        ret.append((''.join(x[0] for x in g), True))
+                    else:
+                        ret.extend(g)
+                if len(ret) == 1:
+                    return ret[0]
+                return (' '.join(to_rule(x) for x in ret), False)  # join the merged items, not the raw seq
+
+            while i < length:
+                c = pattern[i]
+                if c == '.':
+                    seq.append((get_dot(), False))
+                    i += 1
+                elif c == '(':
+                    i += 1
+                    if i < length:
+                        assert pattern[i] != '?', f'Unsupported pattern syntax "{pattern[i]}" at index {i} of /{pattern}/'
+                    seq.append((f'({to_rule(transform())})', False))
+                elif c == ')':
+                    i += 1
+                    assert start > 0 and pattern[start-1] == '(', f'Unbalanced parentheses; start = {start}, i = {i}, pattern = {pattern}'
+                    return join_seq()
+                elif c == '[':
+                    square_brackets = c
+                    i += 1
+                    while i < length and pattern[i] != ']':
+                        if pattern[i] == '\\':
+                            square_brackets += pattern[i:i+2]
+                            i += 2
+                        else:
+                            square_brackets += pattern[i]
+                            i += 1
+                    assert i < length, f'Unbalanced square brackets; start = {start}, i = {i}, pattern = {pattern}'
+                    square_brackets += ']'
+                    i += 1
+                    seq.append((square_brackets, False))
+                elif c == '|':
+                    seq.append(('|', False))
+                    i += 1
+                elif c in ('*', '+', '?'):
+                    seq[-1] = (to_rule(seq[-1]) + c, False)
+                    i += 1
+                elif c == '{':
+                    curly_brackets = c
+                    i += 1
+                    while i < length and pattern[i] != '}':
+                        curly_brackets += pattern[i]
+                        i += 1
+                    assert i < length, f'Unbalanced curly brackets; start = {start}, i = {i}, pattern = {pattern}'
+                    curly_brackets += '}'
+                    i += 1
+                    nums = [s.strip() for s in curly_brackets[1:-1].split(',')]
+                    min_times = 0
+                    max_times = None
+                    try:
+                        if len(nums) == 1:
+                            min_times = int(nums[0])
+                            max_times = min_times
+                        else:
+                            assert len(nums) == 2
+                            min_times = int(nums[0]) if nums[0] else 0
+                            max_times = int(nums[1]) if nums[1] else None
+                    except ValueError:
+                        raise ValueError(f'Invalid quantifier {curly_brackets} in /{pattern}/')
+
+                    sub_is_literal = seq[-1][1]
+                    sub = to_rule(seq[-1])  # quoted when literal, so the operators below apply to a valid GBNF term
+                    if min_times == 0 and max_times is None:
+                        seq[-1] = (f'{sub}*', False)
+                    elif min_times == 0 and max_times == 1:
+                        seq[-1] = (f'{sub}?', False)
+                    elif min_times == 1 and max_times is None:
+                        seq[-1] = (f'{sub}+', False)
+                    else:
+                        if not sub_is_literal:
+                            id = sub_rule_ids.get(sub)
+                            if id is None:
+                                id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub)
+                                sub_rule_ids[sub] = id
+                            sub = id
+
+                        seq[-1] = (
+                            ' '.join(
+                                ([f'"{sub[1:-1] * min_times}"'] if sub_is_literal and min_times > 0 else [sub] * min_times) +
+                                ([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])),
+                            False
+                        )
+                else:
+                    literal = ''
+                    while i < length:
+                        if pattern[i] == '\\' and i < length - 1:
+                            next = pattern[i + 1]
+                            if next in ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS:
+                                i += 1
+                                literal += pattern[i]
+                                i += 1
+                            else:
+                                literal += pattern[i:i+2]
+                                i += 2
+                        elif pattern[i] == '"' and not self._raw_pattern:
+                            literal += '\\"'
+                            i += 1
+                        elif pattern[i] not in NON_LITERAL_SET and \
+                                (i == length - 1 or literal == '' or pattern[i+1] == '.' or pattern[i+1] not in NON_LITERAL_SET):
+                            literal += pattern[i]
+                            i += 1
+                        else:
+                            break
+                    if literal:
+                        seq.append((literal, True))
+
+            return join_seq()
+
+        return self._add_rule(
+            name,
+            to_rule(transform()) if self._raw_pattern \
+                else "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space")
+
+
+    def _resolve_ref(self, ref):
+        ref_name = ref.split('/')[-1]
+        if ref_name not in self._rules and ref not in self._refs_being_resolved:
+            self._refs_being_resolved.add(ref)
+            resolved = self._refs[ref]
+            ref_name = self.visit(resolved, ref_name)
+            self._refs_being_resolved.remove(ref)
+        return ref_name
+
+    def _generate_constant_rule(self, value):
+        return self._format_literal(json.dumps(value))
+
    def visit(self, schema, name):
        schema_type = schema.get('type')
-        rule_name = name or 'root'
+        schema_format = schema.get('format')
+        rule_name = name + '-' if name in RESERVED_NAMES else name or 'root'

-        if 'oneOf' in schema or 'anyOf' in schema:
-            rule = ' | '.join((
-                self.visit(alt_schema, f'{name}{"-" if name else ""}{i}')
-                for i, alt_schema in enumerate(schema.get('oneOf') or schema['anyOf'])
-            ))
-            return self._add_rule(rule_name, rule)
+        if (ref := schema.get('$ref')) is not None:
+            return self._add_rule(rule_name, self._resolve_ref(ref))
+
+        elif 'oneOf' in schema or 'anyOf' in schema:
+            return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf']))
+
+        elif isinstance(schema_type, list):
+            return self._add_rule(rule_name, self._generate_union_rule(name, [{'type': t} for t in schema_type]))

        elif 'const' in schema:
-            return self._add_rule(rule_name, self._format_literal(schema['const']))
+            return self._add_rule(rule_name, self._generate_constant_rule(schema['const']))

        elif 'enum' in schema:
-            rule = ' | '.join((self._format_literal(v) for v in schema['enum']))
+            rule = ' | '.join((self._generate_constant_rule(v) for v in schema['enum']))
            return self._add_rule(rule_name, rule)

-        elif schema_type == 'object' and 'properties' in schema:
-            # TODO: `required` keyword
-            prop_order = self._prop_order
-            prop_pairs = sorted(
-                schema['properties'].items(),
-                # sort by position in prop_order (if specified) then by key
-                key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
+        elif schema_type in (None, 'object') and \
+             ('properties' in schema or \
+              ('additionalProperties' in schema and schema['additionalProperties'] is not True)):
+            required = set(schema.get('required', []))
+            properties = list(schema.get('properties', {}).items())
+            return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties')))
+
+        elif schema_type in (None, 'object') and 'allOf' in schema:
+            required = set()
+            properties = []
+            hybrid_name = name
+            def add_component(comp_schema, is_required):
+                if (ref := comp_schema.get('$ref')) is not None:
+                    comp_schema = self._refs[ref]
+
+                if 'properties' in comp_schema:
+                    for prop_name, prop_schema in comp_schema['properties'].items():
+                        properties.append((prop_name, prop_schema))
+                        if is_required:
+                            required.add(prop_name)
+
+            for t in schema['allOf']:
+                if 'anyOf' in t:
+                    for tt in t['anyOf']:
+                        add_component(tt, is_required=False)
+                else:
+                    add_component(t, is_required=True)
+
+            return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=[]))
+
+        elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):
+            items = schema.get('items') or schema['prefixItems']
+            if isinstance(items, list):
+                return self._add_rule(
+                    rule_name,
+                    '"[" space ' +
+                    ' "," space '.join(
+                        self.visit(item, f'{name}{"-" if name else ""}tuple-{i}')
+                        for i, item in enumerate(items)) +
+                    ' "]" space')
+            else:
+                item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item')
+                list_item_operator = f'( "," space {item_rule_name} )'
+                successive_items = ""
+                min_items = schema.get("minItems", 0)
+                max_items = schema.get("maxItems")
+                if min_items > 0:
+                    successive_items = list_item_operator * (min_items - 1)
+                    min_items -= 1  # the first item is emitted separately in the final rule
+                if max_items is not None and max_items > min_items:
+                    successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
+                else:
+                    successive_items += list_item_operator + "*"
+                if schema.get("minItems", 0) == 0:  # test the schema value: min_items was decremented above
+                    rule = f'"[" space ( {item_rule_name} {successive_items} )? "]" space'
+                else:
+                    rule = f'"[" space {item_rule_name} {successive_items} "]" space'
+                return self._add_rule(rule_name, rule)
+
+        elif schema_type in (None, 'string') and 'pattern' in schema:
+            return self._visit_pattern(schema['pattern'], rule_name)
+
+        elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''):
+            return self._add_rule(
+                'root' if rule_name == 'root' else schema_format,
+                PRIMITIVE_RULES['uuid']
            )

-            rule = '"{" space'
-            for i, (prop_name, prop_schema) in enumerate(prop_pairs):
-                prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
-                if i > 0:
-                    rule += ' "," space'
-                rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}'
-            rule += ' "}" space'
+        elif schema_type in (None, 'string') and f'{schema_format}-string' in DATE_RULES:
+            for t, r in DATE_RULES.items():
+                self._add_rule(t, r)
+            return self._add_rule('root', f'{schema_format}-string') if rule_name == 'root' else f'{schema_format}-string'  # a top-level schema must still define `root`

-            return self._add_rule(rule_name, rule)
-
-        elif schema_type == 'array' and 'items' in schema:
-            # TODO `prefixItems` keyword
-            item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
-            list_item_operator = f'("," space {item_rule_name})'
-            successive_items = ""
-            min_items = schema.get("minItems", 0)
-            if min_items > 0:
-               first_item = f"({item_rule_name})"
-               successive_items = list_item_operator * (min_items - 1)
-               min_items -= 1
-            else:
-               first_item = f"({item_rule_name})?"
-            max_items = schema.get("maxItems")
-            if max_items is not None and max_items > min_items:
-                successive_items += (list_item_operator + "?") * (max_items - min_items - 1)
-            else:
-                successive_items += list_item_operator + "*"
-            rule = f'"[" space {first_item} {successive_items} "]" space'
-            return self._add_rule(rule_name, rule)
+        elif (schema_type == 'object') or (len(schema) == 0):
+            for n in OBJECT_RULE_NAMES:
+                self._add_rule(n, PRIMITIVE_RULES[n])
+            return self._add_rule(rule_name, 'object')

        else:
            assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
+            # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
            return self._add_rule(
                'root' if rule_name == 'root' else schema_type,
                PRIMITIVE_RULES[schema_type]
            )

+    def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
+        prop_order = self._prop_order
+        # sort by position in prop_order (if specified) then by original order
+        sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))]
+
+        prop_kv_rule_names = {}
+        for prop_name, prop_schema in properties:
+            prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
+            prop_kv_rule_names[prop_name] = self._add_rule(
+                f'{name}{"-" if name else ""}{prop_name}-kv',
+                fr'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}'
+            )
+        required_props = [k for k in sorted_props if k in required]
+        optional_props = [k for k in sorted_props if k not in required]
+
+        if additional_properties == True or isinstance(additional_properties, dict):
+            sub_name = f'{name}{"-" if name else ""}additional'
+            value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
+            prop_kv_rule_names["*"] = self._add_rule(
+                f'{sub_name}-kv',
+                self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
+            )
+            optional_props.append("*")
+
+        rule = '"{" space '
+        rule += ' "," space '.join(prop_kv_rule_names[k] for k in required_props)
+
+        if optional_props:
+            rule += ' ('
+            if required_props:
+                rule += ' "," space ( '
+
+            def get_recursive_refs(ks, first_is_optional):
+                [k, *rest] = ks
+                kv_rule_name = prop_kv_rule_names[k]
+                if k == '*':
+                    res = self._add_rule(
+                        f'{name}{"-" if name else ""}additional-kvs',
+                        f'{kv_rule_name} ( "," space ' + kv_rule_name + ' )*'
+                    )
+                elif first_is_optional:
+                    res = f'( "," space {kv_rule_name} )?'
+                else:
+                    res = kv_rule_name
+                if len(rest) > 0:
+                    res += ' ' + self._add_rule(
+                        f'{name}{"-" if name else ""}{k}-rest',
+                        get_recursive_refs(rest, first_is_optional=True)
+                    )
+                return res
+
+            rule += ' | '.join(
+                get_recursive_refs(optional_props[i:], first_is_optional=False)
+                for i in range(len(optional_props))
+            )
+            if required_props:
+                rule += ' )'
+            rule += ' )?'
+
+        rule += ' "}" space'
+
+        return rule
+
    def format_grammar(self):
-        return '\n'.join((f'{name} ::= {rule}' for name, rule in self._rules.items()))
+        return '\n'.join(
+            f'{name} ::= {rule}'
+            for name, rule in sorted(self._rules.items(), key=lambda kv: kv[0])
+        )


 def main(args_in = None):
@@ -129,16 +501,47 @@ def main(args_in = None):
        type=lambda s: s.split(','),
        help='''
            comma-separated property names defining the order of precedence for object properties;
-            properties not specified here are given lower precedence than those that are, and are
-            sorted alphabetically
+            properties not specified here are given lower precedence than those that are, and
+            are kept in their original order from the schema. Required properties are always
+            given precedence over optional properties.
        '''
    )
+    parser.add_argument(
+        '--allow-fetch',
+        action='store_true',
+        default=False,
+        help='Whether to allow fetching referenced schemas over HTTPS')
+    parser.add_argument(
+        '--dotall',
+        action='store_true',
+        default=False,
+        help='Whether to treat dot (".") as matching all chars including line breaks in regular expression patterns')
+    parser.add_argument(
+        '--raw-pattern',
+        action='store_true',
+        default=False,
+        help='Treats string patterns as raw patterns w/o quotes (or quote escapes)')
+
    parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
    args = parser.parse_args(args_in)

-    schema = json.load(sys.stdin if args.schema == '-' else open(args.schema))
-    prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
-    converter = SchemaConverter(prop_order)
+    if args.schema.startswith('https://'):
+        url = args.schema
+        import requests
+        schema = requests.get(url).json()
+    elif args.schema == '-':
+        url = 'stdin'
+        schema = json.load(sys.stdin)
+    else:
+        url = f'file://{args.schema}'
+        with open(args.schema) as f:
+            schema = json.load(f)
+    converter = SchemaConverter(
+        prop_order={name: idx for idx, name in enumerate(args.prop_order)},
+        allow_fetch=args.allow_fetch,
+        dotall=args.dotall,
+        raw_pattern=args.raw_pattern)
+    schema = converter.resolve_refs(schema, url)
    converter.visit(schema, '')
    print(converter.format_grammar())

--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -113,11 +113,11 @@ static std::string get_cpu_info() {

 static std::string get_gpu_info() {
    std::string id;
-#ifdef GGML_USE_CUBLAS
-    int count = ggml_cuda_get_device_count();
+#ifdef GGML_USE_CUDA
+    int count = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < count; i++) {
        char buf[128];
-        ggml_cuda_get_device_description(i, buf, sizeof(buf));
+        ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
        id += buf;
        if (i < count - 1) {
            id += "/";
@@ -249,6 +249,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
    if (s == "q5_1") {
        return GGML_TYPE_Q5_1;
    }
+    if (s == "iq4_nl") {
+        return GGML_TYPE_IQ4_NL;
+    }

    return GGML_TYPE_COUNT;
 }
@@ -805,7 +808,7 @@ struct test {

 const std::string test::build_commit = LLAMA_COMMIT;
 const int         test::build_number = LLAMA_BUILD_NUMBER;
-const bool        test::cuda         = !!ggml_cpu_has_cublas();
+const bool        test::cuda         = !!ggml_cpu_has_cuda();
 const bool        test::opencl       = !!ggml_cpu_has_clblast();
 const bool        test::vulkan       = !!ggml_cpu_has_vulkan();
 const bool        test::kompute      = !!ggml_cpu_has_kompute();
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -1,11 +1,13 @@
 # MobileVLM

-Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
+Currently this implementation supports [MobileVLM-1.7B](https://huggingface.co/mtgv/MobileVLM-1.7B) / [MobileVLM_V2-1.7B](https://huggingface.co/mtgv/MobileVLM_V2-1.7B) variants.

 for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)

 The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.

+Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using MobileVLM as an example, the different conversion step will be shown.
+
 ## Usage
 Build with cmake or run `make llava-cli` to build it.

@@ -34,7 +36,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
 ```

-3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

 ```sh
 python ./examples/llava/convert-image-encoder-to-gguf \
@@ -44,6 +46,14 @@ python ./examples/llava/convert-image-encoder-to-gguf \
    --projector-type ldp
 ```

+```sh
+python ./examples/llava/convert-image-encoder-to-gguf \
+    -m path/to/clip-vit-large-patch14-336 \
+    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
+    --output-dir path/to/MobileVLM-1.7B_V2 \
+    --projector-type ldpv2
+```
+
 4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
@@ -114,7 +124,7 @@ llama_print_timings:       total time =   34570.79 ms
 ## Orin compile and run
 ### compile
 ```sh
-make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
+make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
 ```

 ### run on Orin
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -7,7 +7,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"

-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
 #endif

@@ -119,6 +119,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_LLAVA_PROJ      "mm.%d.%s"
 #define TN_MVLM_PROJ_MLP   "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+#define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE   "model.image_newline"


@@ -126,12 +127,14 @@ enum projector_type {
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM,
    PROJECTOR_TYPE_LDP,
+    PROJECTOR_TYPE_LDPV2,
    PROJECTOR_TYPE_UNKNOWN,
 };

 static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_MLP, "mlp" },
    { PROJECTOR_TYPE_LDP, "ldp" },
+    { PROJECTOR_TYPE_LDPV2, "ldpv2"},
 };


@@ -475,6 +478,14 @@ struct clip_vision_model {
    struct ggml_tensor * mm_model_block_2_block_2_0_w;
    struct ggml_tensor * mm_model_block_2_block_2_1_w;
    struct ggml_tensor * mm_model_block_2_block_2_1_b;
+
+    // MobileVLM_V2 projection
+    struct ggml_tensor * mm_model_mlp_0_w;
+    struct ggml_tensor * mm_model_mlp_0_b;
+    struct ggml_tensor * mm_model_mlp_2_w;
+    struct ggml_tensor * mm_model_mlp_2_b;
+    struct ggml_tensor * mm_model_peg_0_w;
+    struct ggml_tensor * mm_model_peg_0_b;
 };

 struct clip_ctx {
@@ -497,7 +508,6 @@ struct clip_ctx {

    // memory buffers to evaluate the model
    ggml_backend_buffer_t params_buffer  = NULL;
-    ggml_backend_buffer_t compute_buffer = NULL;

    ggml_backend_t backend       = NULL;
    ggml_gallocr_t compute_alloc = NULL;
@@ -808,6 +818,29 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            }
            embeddings = block_1;
        }
+        else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
+        {
+            int n_patch = 24;
+            struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+            mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+            mlp_0 = ggml_gelu(ctx0, mlp_0);
+            struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+            mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+            // mlp_2 ne = [2048, 576, 1, 1]
+            // AVG Pool Layer 2*2, strides = 2
+            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
+            // mlp_2 ne = [576, 2048, 1, 1]
+            mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+            // mlp_2 ne [24, 24, 2048, 1]
+            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+            // weight ne = [3, 3, 2048, 1]
+            struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+            peg_0 = ggml_add(ctx0, peg_0, mlp_2);
+            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+            embeddings = peg_0;
+        }
        else {
            GGML_ASSERT(false);
        }
@@ -935,7 +968,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        }
    }

-#ifdef GGML_USE_CUBLAS
+#ifdef GGML_USE_CUDA
    new_clip->backend = ggml_backend_cuda_init(0);
    printf("%s: CLIP using CUDA backend\n", __func__);
 #endif
@@ -1178,7 +1211,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            vision_model.mm_model_block_2_block_2_0_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
            vision_model.mm_model_block_2_block_2_1_w   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
            vision_model.mm_model_block_2_block_2_1_b   = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
-        } else {
+        }
+        else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
+        {
+            // MobileVLM_V2 projection
+            vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
+            vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
+            vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
+            vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
+            vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
+            vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
+        }
+        else {
            std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
            throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
        }
@@ -1235,16 +1279,16 @@ struct clip_image_f32 * clip_image_f32_init() {

 void clip_image_u8_free(struct clip_image_u8  * img) { delete img; }
 void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
-void clip_image_u8_batch_free(struct clip_image_u8_batch  & batch) {
-    if (batch.size > 0) {
-        delete[] batch.data;
-        batch.size = 0;
+void clip_image_u8_batch_free(struct clip_image_u8_batch  * batch) {
+    if (batch->size > 0) {
+        delete[] batch->data;
+        batch->size = 0;
    }
 }
-void clip_image_f32_batch_free(struct clip_image_f32_batch  & batch) {
-    if (batch.size > 0) {
-        delete[] batch.data;
-        batch.size = 0;
+void clip_image_f32_batch_free(struct clip_image_f32_batch  * batch) {
+    if (batch->size > 0) {
+        delete[] batch->data;
+        batch->size = 0;
    }
 }

@@ -1497,7 +1541,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im

 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) {
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
    bool pad_to_square = true;
    if (!ctx->has_vision_encoder) {
        printf("This gguf file seems to have no vision encoder\n");
@@ -1509,11 +1553,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
        pad_to_square = false;
    }
    // free the previous res_imgs if any set
-    if (res_imgs.size > 0) {
+    if (res_imgs->size > 0) {
        clip_image_f32_batch_free(res_imgs);
    }
-    res_imgs.data = nullptr;
-    res_imgs.size = 0;
+    res_imgs->data = nullptr;
+    res_imgs->size = 0;

    // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
    // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
@@ -1568,11 +1612,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
            bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
            patches.insert(patches.begin(), image_original_resize);
            // clip_image_f32_batch_init(patches.size());
-            res_imgs.size = patches.size();
-            res_imgs.data = new clip_image_f32[res_imgs.size];
+            res_imgs->size = patches.size();
+            res_imgs->data = new clip_image_f32[res_imgs->size];
            int num=0;
            for (auto& patch : patches) {
-                normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std);
+                normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
                num++;
            }

@@ -1660,9 +1704,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
    // }
    // res_imgs.push_back(res);

-    res_imgs.size = 1;
-    res_imgs.data = new clip_image_f32[res_imgs.size];
-    res_imgs.data[0] = *res;
+    res_imgs->size = 1;
+    res_imgs->data = new clip_image_f32[res_imgs->size];
+    res_imgs->data[0] = *res;
    clip_image_f32_free(res);

    return true;
@@ -1676,6 +1720,9 @@ void clip_free(clip_ctx * ctx) {
    ggml_free(ctx->ctx_data);
    gguf_free(ctx->ctx_gguf);

+    ggml_backend_buffer_free(ctx->params_buffer);
+    ggml_backend_free(ctx->backend);
+    ggml_gallocr_free(ctx->compute_alloc);
    delete ctx;
 }

@@ -1964,6 +2011,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
        return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
    }
+    if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+        return ctx->vision_model.mm_model_peg_0_b->ne[0];
+    }
    if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
        return ctx->vision_model.mm_2_b->ne[0];
    }
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -60,8 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init();

 CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
-CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  & batch);
-CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
+CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
+CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);

@@ -69,7 +69,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

 /** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
-CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs );
+CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );

 CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -1,6 +1,7 @@
 import argparse
 import os
 import json
+import re

 import torch
 import numpy as np
@@ -38,9 +39,11 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b
 def get_tensor_name(name: str) -> str:
    if "projection" in name:
        return name
-
    if "mm_projector" in name:
-        return name.replace("model.mm_projector", "mm")
+        name = name.replace("model.mm_projector", "mm")
+        name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+        name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+        return name

    return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")

@@ -83,7 +86,7 @@ ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
 ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                help="The clip model is from openclip (for ViT-SO400M type))")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
-ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
 # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -223,7 +223,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    clip_image_f32_batch img_res_v;
    img_res_v.size = 0;
    img_res_v.data = nullptr;
-    if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
+    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
        fprintf(stderr, "%s: unable to preprocess image\n", __func__);
        delete[] img_res_v.data;
        return false;
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@@ -29,9 +29,9 @@ struct llava_image_embed {
 };

 /** sanity check for clip <-> llava embed size match */
-LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
+LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);

-LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
+LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);

 /** build an image embed from image file bytes */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
--- a/examples/lookup/CMakeLists.txt
+++ b/examples/lookup/CMakeLists.txt
@@ -3,3 +3,21 @@ add_executable(${TARGET} lookup.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TARGET lookup-create)
+add_executable(${TARGET} lookup-create.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TARGET lookup-merge)
+add_executable(${TARGET} lookup-merge.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TARGET lookup-stats)
+add_executable(${TARGET} lookup-stats.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -0,0 +1,43 @@
+#include "ggml.h"
+#include "llama.h"
+#include "common.h"
+#include "ngram-cache.h"
+
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+// Builds an n-gram cache from the tokenized prompt (params.prompt) and saves it
+// to the file named by params.lookup_cache_static.
+//
+// Returns 1 when the command line cannot be parsed, 0 otherwise.
+int main(int argc, char ** argv){
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        return 1;
+    }
+    // init llama.cpp
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    llama_model * model = NULL;
+    llama_context * ctx = NULL;
+
+    // load the model
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    GGML_ASSERT(model != nullptr);
+
+    // tokenize the prompt
+    const bool add_bos = llama_should_add_bos_token(model);
+
+    std::vector<llama_token> inp;
+    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+    fprintf(stderr, "%s: tokenization done\n", __func__);
+
+    // hash every n-gram of the prompt into the cache (progress printing enabled)
+    llama_ngram_cache ngram_cache;
+    llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
+    fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
+
+    llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+
+    // release the context, the model and the backend before exiting
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    return 0;
+}
--- a/examples/lookup/lookup-merge.cpp
+++ b/examples/lookup/lookup-merge.cpp
@@ -0,0 +1,47 @@
+#include "ggml.h"
+#include "llama.h"
+#include "common.h"
+#include "ngram-cache.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+// Writes the one-line description and the usage synopsis of lookup-merge to stderr.
+static void print_usage() {
+    fputs("Merges multiple lookup cache files into a single one.\n", stderr);
+    fputs("Usage: lookup-merge [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n", stderr);
+}
+
+// Merges the lookup cache files named on the command line: every argument except
+// the last one is loaded and merged, and the result is saved to the last argument.
+//
+// Returns 0 on success (or when help was requested) and 1 on a usage error or
+// when an input cache cannot be loaded.
+int main(int argc, char ** argv){
+    // Collect the arguments first so that -h/--help is honored with exit code 0
+    // even when fewer than two file names were given.
+    std::vector<std::string> args;
+    for (int i = 1; i < argc; ++i) {
+        args.push_back(argv[i]);
+        if (args.back() == "-h" || args.back() == "--help") {
+            print_usage();
+            return 0;
+        }
+    }
+
+    // At least one input file and the output file are required.
+    if (args.size() < 2) {
+        print_usage();
+        return 1;
+    }
+
+    llama_ngram_cache ngram_cache_merged;
+    size_t i_file = 0; // index of the file currently being loaded, used for error reporting
+
+    // llama_ngram_cache_load is expected to report an unreadable file by throwing
+    // std::ifstream::failure (the other lookup tools catch exactly this type).
+    try {
+        fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
+        ngram_cache_merged = llama_ngram_cache_load(args[0]);
+
+        // args.back() is the output file, so stop one element before the end
+        for (i_file = 1; i_file + 1 < args.size(); ++i_file) {
+            fprintf(stderr, "lookup-merge: loading file %s\n", args[i_file].c_str());
+            llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i_file]);
+
+            llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
+        }
+    } catch (std::ifstream::failure const &) {
+        fprintf(stderr, "error: failed to open lookup cache: %s\n", args[i_file].c_str());
+        return 1;
+    }
+
+    fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
+    llama_ngram_cache_save(ngram_cache_merged, args.back());
+
+    return 0;
+}
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -0,0 +1,163 @@
+#include "ggml.h"
+#include "common.h"
+#include "llama.h"
+#include "log.h"
+#include "ngram-cache.h"
+
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <fstream>
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+// Simulates lookup decoding over the tokenized prompt without evaluating the model:
+// the prompt is cut into chunks of n_ctx tokens, drafts are produced from the n-gram
+// caches and compared against the actual next tokens of the chunk, and the number of
+// drafted/accepted tokens plus the time spent drafting are printed at the end.
+//
+// Returns 1 when the command line cannot be parsed, 0 otherwise. Exits with status 1
+// when the static lookup cache was requested but cannot be opened.
+int main(int argc, char ** argv){
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        return 1;
+    }
+
+    const int n_draft = params.n_draft;
+
+    // init llama.cpp
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    llama_model * model = NULL;
+    llama_context * ctx = NULL;
+
+    // load the model
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    // fail early instead of passing null pointers to the calls below
+    GGML_ASSERT(model != nullptr);
+    llama_set_rng_seed(ctx, params.seed);
+    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
+
+    // tokenize the prompt
+    const bool add_bos = llama_should_add_bos_token(model);
+    LOG("add_bos tgt: %d\n", add_bos);
+
+    std::vector<llama_token> inp;
+    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
+
+    llama_ngram_cache ngram_cache_context;
+    llama_ngram_cache ngram_cache_dynamic;
+    llama_ngram_cache ngram_cache_static;
+    int64_t t_draft_flat_us = 0; // time spent loading the lookup caches
+    int64_t t_draft_us = 0;      // time spent drafting and updating the context cache
+
+    {
+        const int64_t t_start_draft_us = ggml_time_us();
+
+        if (!params.lookup_cache_static.empty()) {
+            try {
+                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
+            } catch (std::ifstream::failure const &) {
+                fprintf(stderr, "error: failed to open static lookup cache: %s\n", params.lookup_cache_static.c_str());
+                exit(1);
+            }
+        }
+
+        if (!params.lookup_cache_dynamic.empty()) {
+            try {
+                ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
+            } catch (std::ifstream::failure const &) {} // a missing dynamic cache is not an error: start with an empty one
+        }
+
+        t_draft_flat_us += ggml_time_us() - t_start_draft_us;
+    }
+
+    const int n_input = inp.size();
+    const int n_ctx = params.n_ctx;
+
+    int n_drafted = 0;
+    int n_accept  = 0;
+
+    const int64_t t_start_ms = ggml_time_ms();
+
+    // Iterate over input tokens in chunks of size n_ctx.
+    // Each chunk is treated as if a sequential generation but with pre-determined tokens to ensure reproducibility.
+    // The bound is inclusive so that the last complete chunk is also processed when n_input is an
+    // exact multiple of n_ctx; this matches the n_predict value reported below.
+    for (int i_start = 0; i_start + n_ctx <= n_input; i_start += n_ctx) {
+        const std::vector<llama_token> inp_slice(inp.begin() + i_start, inp.begin() + i_start + n_ctx);
+        std::vector<llama_token> pseudo_output;
+        pseudo_output.push_back(inp_slice[0]);
+
+        while ((int) pseudo_output.size() < n_ctx) {
+            // Simulate drafting and decoding from draft:
+            std::vector<llama_token> draft;
+            draft.push_back(pseudo_output.back());
+
+            {
+                const int64_t t_start_draft_us = ggml_time_us();
+                llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+                t_draft_us += ggml_time_us() - t_start_draft_us;
+            }
+
+            // draft[0] is the last already-known token, so it does not count as drafted
+            n_drafted += draft.size() - 1;
+
+            for (size_t j = 1; j < draft.size() && (int) pseudo_output.size() < n_ctx; ++j) {
+                const llama_token ground_truth = inp_slice[pseudo_output.size()];
+                const llama_token drafted = draft[j];
+
+                // the first mismatch invalidates the rest of the draft
+                if (ground_truth != drafted) {
+                    break;
+                }
+
+                ++n_accept;
+                pseudo_output.push_back(ground_truth);
+
+                {
+                    const int64_t t_start_draft_us = ggml_time_us();
+                    llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
+                    t_draft_us += ggml_time_us() - t_start_draft_us;
+                }
+            }
+
+            // After each simulated batch decoding simulate the sampling of a single token:
+            if ((int) pseudo_output.size() < n_ctx) {
+                pseudo_output.push_back(inp_slice[pseudo_output.size()]);
+                {
+                    const int64_t t_start_draft_us = ggml_time_us();
+                    llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
+                    t_draft_us += ggml_time_us() - t_start_draft_us;
+                }
+            }
+        }
+
+        // Print progress whenever i_start crosses a multiple of 100000 tokens:
+        if (i_start > 0 && i_start / 100000 != (i_start - n_ctx) / 100000) {
+            const int64_t t_now_ms = ggml_time_ms();
+            const int64_t eta_ms   = (n_input - i_start) * (t_now_ms - t_start_ms) / i_start;
+            const int64_t eta_min  = eta_ms / (60*1000);
+            const int64_t eta_s    = (eta_ms - 60*1000*eta_min) / 1000;
+
+            LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
+        }
+
+        // After each chunk, update the dynamic ngram cache with the context ngram cache:
+        llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+        ngram_cache_context.clear();
+    }
+
+    LOG_TEE("\n");
+
+    LOG_TEE("\n");
+    LOG_TEE("n_draft      = %d\n", n_draft);
+    LOG_TEE("n_predict    = %d\n", n_input - n_input % n_ctx);
+    LOG_TEE("n_drafted    = %d\n", n_drafted);
+    LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+    LOG_TEE("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+            t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
+    LOG_TEE("n_accept     = %d\n", n_accept);
+    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);
+
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    fprintf(stderr, "\n\n");
+
+    return 0;
+}
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -1,12 +1,15 @@
-#include "common.h"
 #include "ggml.h"
 #include "llama.h"
+#include "common.h"
+#include "ngram-cache.h"

 #include <cmath>
 #include <cstdint>
 #include <cstdio>
+#include <fstream>
 #include <string>
 #include <vector>
+#include <unordered_map>

 int main(int argc, char ** argv){
    gpt_params params;
@@ -15,11 +18,7 @@ int main(int argc, char ** argv){
        return 1;
    }

-    // max/min n-grams size to search for in prompt
-    const int ngram_max = 4;
-    const int ngram_min = 1;
-
-    // length of the candidate / draft sequence, if match is found
+    // max. number of additional tokens to draft if match is found
    const int n_draft = params.n_draft;

    const bool dump_kv_cache = params.dump_kv_cache;
@@ -39,6 +38,8 @@ int main(int argc, char ** argv){

    // load the model
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_set_rng_seed(ctx, params.seed);
+    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

    // tokenize the prompt
    const bool add_bos = llama_should_add_bos_token(model);
@@ -47,6 +48,35 @@ int main(int argc, char ** argv){
    std::vector<llama_token> inp;
    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);

+    llama_ngram_cache ngram_cache_context;
+    llama_ngram_cache ngram_cache_dynamic;
+    llama_ngram_cache ngram_cache_static;
+    int64_t t_draft_flat_us = 0;
+    int64_t t_draft_us = 0;
+
+    {
+        // Fill up context ngram cache with tokens from user input:
+        const int64_t t_start_draft_us = ggml_time_us();
+        llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
+
+        if (!params.lookup_cache_static.empty()) {
+            try {
+                ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
+            } catch (std::ifstream::failure const &) {
+                fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+                exit(1);
+            }
+        }
+
+        if (!params.lookup_cache_dynamic.empty()) {
+            try {
+                ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
+            } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
+        }
+
+        t_draft_flat_us += ggml_time_us() - t_start_draft_us;
+    }
+
    const int max_context_size     = llama_n_ctx(ctx);
    const int max_tokens_list_size = max_context_size - 4;

@@ -76,8 +106,6 @@ int main(int argc, char ** argv){
    int n_drafted = 0;
    int n_accept  = 0;

-    int64_t t_draft_us = 0;
-
    int n_past = inp.size();

    bool has_eos = false;
@@ -129,6 +157,12 @@ int main(int argc, char ** argv){
                ++n_past;
                ++i_dft;
                inp.push_back(id);
+                {
+                    // Update context ngram cache with the newly accepted token:
+                    const int64_t t_start_draft_us = ggml_time_us();
+                    llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+                    t_draft_us += ggml_time_us() - t_start_draft_us;
+                }

                if (params.use_color) {
                    // color accepted draft token
@@ -149,6 +183,12 @@ int main(int argc, char ** argv){
            draft.clear();
            draft.push_back(id);
            inp.push_back(id);
+            {
+                // Update context ngram cache with the newly accepted token:
+                const int64_t t_start_draft_us = ggml_time_us();
+                llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+                t_draft_us += ggml_time_us() - t_start_draft_us;
+            }
            break;
        }

@@ -163,44 +203,19 @@ int main(int argc, char ** argv){
        llama_batch_clear(batch_tgt);
        llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);

-        // generate n_pred tokens through prompt lookup
-        auto prompt_lookup = [&]() -> void {
-            const int inp_size = inp.size();
-            for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
-                const llama_token * ngram = &inp[inp_size - ngram_size];
-
-                for (int i = 0; i <= (int) inp_size - (ngram_size * 2); ++i) {
-                    bool match = true;
-                    for (int j = 0; j < ngram_size; ++j) {
-                        if (inp[i + j] != ngram[j]) {
-                            match = false;
-                            break;
-                        }
-                    }
-
-                    if (match) {
-                        const int startIdx = i + ngram_size;
-                        const int endIdx = startIdx + n_draft;
-                        if (endIdx < inp_size) {
-                            for (int j = startIdx; j < endIdx; ++j) {
-                                LOG(" - draft candidate %d: %d\n", j, inp[j]);
-                                draft.push_back(inp[j]);
-                                llama_batch_add(batch_tgt, inp[j], n_past + (j - startIdx) + 1, { 0 }, true);
-                                ++n_drafted;
-                            }
-                            return;
-                        }
-                    }
-                }
-            }
-            return;
-        };
-
+        // Draft already contains a single token sampled from the model:
+        GGML_ASSERT(draft.size() == 1);
+        GGML_ASSERT(draft[0] == inp.back());
        const int64_t t_start_draft_us = ggml_time_us();

-        prompt_lookup();
+        llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+
+        for (size_t i = 1; i < draft.size(); ++i) {
+            llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
+        }

        t_draft_us += ggml_time_us() - t_start_draft_us;
+        n_drafted += draft.size() - 1;

        llama_decode(ctx, batch_tgt);
        ++n_past;
@@ -210,19 +225,24 @@ int main(int argc, char ** argv){

    auto t_dec_end = ggml_time_us();

+    // Update dynamic ngram cache with context ngram cache and save it to disk:
+    llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+    llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
+
    LOG_TEE("\n\n");

    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));

    LOG_TEE("\n");
-    LOG_TEE("n_draft   = %d\n", n_draft);
-    LOG_TEE("n_predict = %d\n", n_predict);
-    LOG_TEE("n_drafted = %d\n", n_drafted);
-    LOG_TEE("t_draft   = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+    LOG_TEE("n_draft      = %d\n", n_draft);
+    LOG_TEE("n_predict    = %d\n", n_predict);
+    LOG_TEE("n_drafted    = %d\n", n_drafted);
+    LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+    LOG_TEE("t_draft      = %.2f ms, %.2f us per token, %.2f tokens per second\n",
            t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-    LOG_TEE("n_accept  = %d\n", n_accept);
-    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
+    LOG_TEE("n_accept     = %d\n", n_accept);
+    LOG_TEE("accept       = %.3f%%\n", 100.0f * n_accept / n_drafted);

    LOG_TEE("\ntarget:\n");
    llama_print_timings(ctx);
--- a/examples/main-cmake-pkg/README.md
+++ b/examples/main-cmake-pkg/README.md
@@ -8,7 +8,7 @@ Because this example is "outside of the source tree", it is important to first b

 ### Considerations

-When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
+When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.

 ### Build llama.cpp and install to C:\LlamaCPP directory

--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -67,6 +67,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
 In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:

 -   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-mu MODEL_URL, --model-url MODEL_URL`: Specify a remote HTTP URL to download the model file from (e.g., https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
 -   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
@@ -315,8 +316,8 @@ These options provide extra functionality and customization when running the LLa

 -   `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 -   `--verbose-prompt`: Print the prompt before generating text.
-   `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
-   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
-   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+-   `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+-   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+-   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
 -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -132,7 +132,6 @@ int main(int argc, char ** argv) {
    llama_context * ctx = NULL;

    // load the target model
-    params.logits_all = true;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);

    // load the prompts from an external file if there are any
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -380,6 +380,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
            const int batch_size  = std::min(end - batch_start, n_batch);

            //fprintf(stderr, "    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
+            // TODO: use llama_batch.logits instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
                //fprintf(stderr, "%s : failed to eval\n", __func__);
                return {tokens, -1, logit_history, prob_history};
@@ -552,6 +553,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);

+            int n_outputs = 0;
+
            batch.n_tokens = 0;
            for (int seq = 0; seq < n_seq_batch; seq++) {
                int seq_start = batch_start + seq*n_ctx;
@@ -566,11 +569,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

                for (int k = 0; k < batch_size; ++k) {
                    const int idx = seq*n_ctx + k;
-                    batch.token[idx] = tokens[seq_start + k];
-                    batch.pos[idx] = j*n_batch + k;
-                    batch.n_seq_id[idx] = 1;
-                    batch.seq_id[idx][0] = seq;
-                    batch.logits[idx] = batch.pos[idx] >= first ? 1 : 0;
+                    batch.token   [idx]    = tokens[seq_start + k];
+                    batch.pos     [idx]    = j*n_batch + k;
+                    batch.n_seq_id[idx]    = 1;
+                    batch.seq_id  [idx][0] = seq;
+                    batch.logits  [idx]    = batch.pos[idx] >= first ? 1 : 0;
+
+                    n_outputs += batch.logits[idx] != 0;
                }
                batch.n_tokens += batch_size;

@@ -583,9 +588,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
                return {tokens, -1, logit_history, prob_history};
            }

-            if (num_batches > 1) {
+            if (num_batches > 1 && n_outputs > 0) {
                const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+                logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab);
            }
        }

@@ -604,14 +609,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        }

        for (int seq = 0; seq < n_seq_batch; seq++) {
-            const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx);
+            const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first);
+
            llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first;
            if (!params.logits_file.empty()) {
-                process_logits(logits_stream, n_vocab, all_logits + first*n_vocab,
+                process_logits(logits_stream, n_vocab, all_logits,
                        tokens_data, n_ctx - 1 - first,
                        workers, log_probs, nll, nll2);
            } else {
-                process_logits(n_vocab, all_logits + first*n_vocab,
+                process_logits(n_vocab, all_logits,
                        tokens_data, n_ctx - 1 - first,
                        workers, nll, nll2,
                        logit_history.data() + start + seq*n_ctx + first,
@@ -652,6 +658,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 }

 static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int32_t n_batch, int32_t n_vocab) {
+    int prev_outputs = 0;
    for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
        const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

@@ -672,7 +679,14 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
            return false;
        }

-        memcpy(batch_logits.data() + i*n_vocab, llama_get_logits(ctx), n_tokens*n_vocab*sizeof(float));
+        int n_outputs = 0;
+        for (int i = 0; i < n_tokens; ++i) {
+            n_outputs += batch_view.logits[i] != 0;
+        }
+
+        memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float));
+
+        prev_outputs += n_outputs;
    }

    return true;
@@ -779,7 +793,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        size_t ending_logprob_count[4];
        double ending_logprob[4];

-        size_t i_batch;         // starting index in the llama_batch
+        size_t i_logits;        // starting index of logits in the llama_batch
        size_t common_prefix;   // max number of initial tokens that are the same in all sentences
        size_t required_tokens; // needed number of tokens to evaluate all 4 endings
        std::vector<llama_token> seq_tokens[4];
@@ -844,9 +858,10 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    const int max_tasks_per_batch = 32;
    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

-    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
+    llama_batch batch = llama_batch_init(n_ctx, 0, 4);

    std::vector<float> tok_logits(n_vocab);
+    // TODO: this could be made smaller; it's currently the worst-case size
    std::vector<float> batch_logits(n_vocab*n_ctx);

    std::vector<std::pair<size_t, llama_token>> eval_pairs;
@@ -857,16 +872,17 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        int n_cur = 0;

        size_t i1 = i0;
-        size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
+        size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch

        llama_batch_clear(batch);

        // batch as much tasks as possible into the available context
-        // each task has 4 unique seuqnce ids - one for each ending
+        // each task has 4 unique sequence ids - one for each ending
        // the common prefix is shared among the 4 sequences to save tokens
        // we extract logits only from the last common token and from all ending tokens of each sequence
        while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) {
            auto & hs_cur = hs_data[i1];
+            int n_logits = 0;

            const int s0 = 4*(i1 - i0);
            if (s0 + 4 > max_seq) {
@@ -874,18 +890,23 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            }

            for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
-                llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
+                llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
            }
            batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
+            n_logits += 1;

            for (int s = 0; s < 4; ++s) {
-                for (size_t i = hs_cur.common_prefix; i < hs_cur.seq_tokens[s].size(); ++i) {
-                    llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, true);
+                const size_t seq_tokens_size = hs_cur.seq_tokens[s].size();
+                // TODO: don't evaluate the last token of each sequence
+                for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
+                    const bool needs_logits = i < seq_tokens_size - 1;
+                    llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+                    n_logits += needs_logits;
                }
            }

-            hs_cur.i_batch = i_batch;
-            i_batch += hs_cur.required_tokens;
+            hs_cur.i_logits = i_logits;
+            i_logits += n_logits;

            n_cur += hs_data[i1].required_tokens;
            if (++i1 == hs_task_count) {
@@ -911,12 +932,11 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        eval_pairs.clear();
        for (size_t i = i0; i < i1; ++i) {
            auto & hs_cur = hs_data[i];
-            size_t li = hs_cur.common_prefix;
+            size_t li = 1; // skip the last logit of the common prefix (computed separately below)
            for (int s = 0; s < 4; ++s) {
                for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
-                    eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]);
+                    eval_pairs.emplace_back(hs_cur.i_logits + li++, hs_cur.seq_tokens[s][j + 1]);
                }
-                ++li;
            }
        }
        // Then we do the actual calculation
@@ -928,7 +948,8 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        for (size_t i = i0; i < i1; ++i) {
            auto & hs_cur = hs_data[i];

-            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(hs_cur.i_batch + hs_cur.common_prefix - 1), n_vocab*sizeof(float));
+            // get the logits of the last token of the common prefix
+            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float));

            const auto first_probs = softmax(tok_logits);

@@ -978,7 +999,7 @@ struct winogrande_entry {
    std::array<std::string, 2> choices;
    int answer;

-    size_t i_batch;
+    size_t i_logits;
    size_t common_prefix;
    size_t required_tokens;
    size_t n_base1; // number of tokens for context + choice 1
@@ -1104,6 +1125,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
            task.common_prefix++;
        }

+        // TODO: the last token of each sequence doesn't need to be evaluated
        task.required_tokens = task.common_prefix +
            task.seq_tokens[0].size() - task.common_prefix +
            task.seq_tokens[1].size() - task.common_prefix;
@@ -1121,9 +1143,10 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
    const int max_tasks_per_batch = 128;
    const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

-    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
+    llama_batch batch = llama_batch_init(n_ctx, 0, 2);

    std::vector<float> tok_logits(n_vocab);
+    // TODO: this could be made smaller; it's currently the worst-case size
    std::vector<float> batch_logits(n_vocab*n_ctx);

    std::vector<std::pair<size_t, llama_token>> eval_pairs;
@@ -1137,29 +1160,33 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        int n_cur = 0;

        size_t i1 = i0;
-        size_t i_batch = 0;
+        size_t i_logits = 0;

        llama_batch_clear(batch);

        while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
+            int n_logits = 0;
            const int s0 = 2*(i1 - i0);
            if (s0 + 2 > max_seq) {
                break;
            }

            for (size_t i = 0; i < data[i1].common_prefix; ++i) {
-                llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1}, false);
+                llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
            }
            batch.logits[batch.n_tokens - 1] = true;
+            n_logits += 1;

            for (int s = 0; s < 2; ++s) {
+                // TODO: end before the last token, no need to predict past the end of the sequences
                for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
                    llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
+                    n_logits += 1;
                }
            }

-            data[i1].i_batch = i_batch;
-            i_batch += data[i1].required_tokens;
+            data[i1].i_logits = i_logits;
+            i_logits += n_logits;

            n_cur += data[i1].required_tokens;
            if (++i1 == data.size()) {
@@ -1190,15 +1217,16 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

            const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix;
            const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
-            size_t li = n_base1 - 1;
+            size_t li = n_base1 - task.common_prefix;
            for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
-                eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]);
+                eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[0][j+1]);
            }
            const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
            const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
-            li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
+            // FIXME: this uses the wrong first logits when not skipping the choice word
+            li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - task.common_prefix;
            for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
-                eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]);
+                eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[1][j+1]);
            }
        }
        compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
@@ -1287,7 +1315,7 @@ struct multiple_choice_task {
    }

    // For evaluation
-    size_t i_batch;         // starting index in the llama_batch
+    size_t i_logits;        // starting index of logits in the llama_batch
    size_t common_prefix;   // max number of initial tokens that are the same in all sentences
    size_t required_tokens; // needed number of tokens to evaluate all answers
    std::vector<std::vector<llama_token>> seq_tokens;
@@ -1366,7 +1394,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
    std::vector<uint32_t> task_pos(n_task);
    strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
    if (strstream.fail()) {
-        printf("%s: failed to raad task positions from prompt\n", __func__);
+        printf("%s: failed to read task positions from prompt\n", __func__);
        return;
    }

@@ -1447,7 +1475,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
            return;
        }
    } else {
-        int n_dot = n_task/100;
+        int n_dot = std::max((int) n_task/100, 1);
        int i_task = 0;
        for (auto& task : tasks) {
            ++i_task;
@@ -1491,17 +1519,18 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        int n_cur = 0;

        size_t i1 = i0;
-        size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
+        size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch

        llama_batch_clear(batch);

        // batch as much tasks as possible into the available context
-        // each task has 4 unique seuqnce ids - one for each ending
+        // each task has 4 unique sequence ids - one for each ending
        // the common prefix is shared among the 4 sequences to save tokens
        // we extract logits only from the last common token and from all ending tokens of each sequence
        int s0 = 0;
        while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
            auto& cur_task = tasks[i1];
+            int n_logits = 0;

            int num_answers = cur_task.seq_tokens.size();
            if (s0 + num_answers > max_seq) {
@@ -1518,17 +1547,22 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
            }
            batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
+            n_logits += 1;

            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
-                for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) {
-                    llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
+                const size_t seq_tokens_size = cur_task.seq_tokens[s].size();
+                // TODO: don't evaluate the last token of each sequence
+                for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
+                    const bool needs_logits = i < seq_tokens_size - 1;
+                    llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+                    n_logits += needs_logits;
                }
            }

            s0 += num_answers;

-            cur_task.i_batch = i_batch;
-            i_batch += cur_task.required_tokens;
+            cur_task.i_logits = i_logits;
+            i_logits += n_logits;

            n_cur += cur_task.required_tokens;
            if (++i1 == tasks.size()) {
@@ -1554,12 +1588,11 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        eval_pairs.clear();
        for (size_t i = i0; i < i1; ++i) {
            auto& cur_task = tasks[i];
-            size_t li = cur_task.common_prefix;
+            size_t li = 1; // skip the last logit of the common prefix (computed separately below)
            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
                for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
-                    eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]);
+                    eval_pairs.emplace_back(cur_task.i_logits + li++, cur_task.seq_tokens[s][j + 1]);
                }
-                ++li;
            }
        }
        // Then we do the actual calculation
@@ -1578,7 +1611,8 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
            //}
            //printf("\n    common_prefix: %zu\n", cur_task.common_prefix);

-            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float));
+            // get the logits of the last token of the common prefix
+            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));

            const auto first_probs = softmax(tok_logits);

@@ -1730,6 +1764,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
            }

+            // TODO: use llama_batch.logits instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return;
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -26,6 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "IQ2_S",  LLAMA_FTYPE_MOSTLY_IQ2_S,  " 2.5  bpw quantization",            },
    { "IQ2_M",  LLAMA_FTYPE_MOSTLY_IQ2_M,  " 2.7  bpw quantization",            },
    { "IQ1_S",  LLAMA_FTYPE_MOSTLY_IQ1_S,  " 1.56 bpw quantization",            },
+    { "IQ1_M",  LLAMA_FTYPE_MOSTLY_IQ1_M,  " 1.75 bpw quantization",            },
    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
    { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization",            },
@@ -87,13 +88,17 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
 //
 [[noreturn]]
 static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
    printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
    printf("  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
    printf("  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
    printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
+    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf("  --override-kv KEY=TYPE:VALUE\n");
+    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
    printf("\nAllowed quantization types:\n");
    for (auto & it : QUANT_OPTIONS) {
@@ -107,14 +112,14 @@ static void usage(const char * executable) {
    exit(1);
 }

-static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
    if (!in) {
-        printf("%s: failed to open %s\n",__func__,imatrix_file.c_str());
+        printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
        return;
    }
    int n_entries;
-    in.read((char*)&n_entries, sizeof(n_entries));
+    in.read((char *)&n_entries, sizeof(n_entries));
    if (in.fail() || n_entries < 1) {
        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
        return;
@@ -124,25 +129,25 @@ static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std
        std::vector<char> name_as_vec(len+1);
        in.read((char *)name_as_vec.data(), len);
        if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str());
+            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
            return;
        }
        name_as_vec[len] = 0;
        std::string name{name_as_vec.data()};
-        auto& e = imatrix_data[std::move(name)];
+        auto & e = imatrix_data[std::move(name)];
        int ncall;
-        in.read((char*)&ncall, sizeof(ncall));
+        in.read((char *)&ncall, sizeof(ncall));
        int nval;
        in.read((char *)&nval, sizeof(nval));
        if (in.fail() || nval < 1) {
-            printf("%s: failed reading number of values for entry %d\n",__func__,i);
+            printf("%s: failed reading number of values for entry %d\n", __func__, i);
            imatrix_data = {};
            return;
        }
        e.resize(nval);
-        in.read((char*)e.data(), nval*sizeof(float));
+        in.read((char *)e.data(), nval*sizeof(float));
        if (in.fail()) {
-            printf("%s: failed reading data for entry %d\n",__func__,i);
+            printf("%s: failed reading data for entry %d\n", __func__, i);
            imatrix_data = {};
            return;
        }
@@ -150,13 +155,13 @@ static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std
            for (auto& v : e) v /= ncall;
        }
    }
-    printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str());
+    printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
 }

-static void prepare_imatrix(const std::string& imatrix_file,
-        const std::vector<std::string>& included_weights,
-        const std::vector<std::string>& excluded_weights,
-        std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+static void prepare_imatrix(const std::string & imatrix_file,
+        const std::vector<std::string> & included_weights,
+        const std::vector<std::string> & excluded_weights,
+        std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
    if (!imatrix_file.empty()) {
        load_imatrix(imatrix_file, imatrix_data);
    }
@@ -189,6 +194,55 @@ static void prepare_imatrix(const std::string& imatrix_file,
    }
 }

+// Look up a ggml_type by the name ggml_type_name reports for it; yields GGML_TYPE_COUNT when nothing matches.
+static ggml_type parse_ggml_type(const char * arg) {
+    for (int idx = 0; idx < GGML_TYPE_COUNT; ++idx) {
+        const auto candidate = ggml_type(idx);
+        const auto * candidate_name = ggml_type_name(candidate);
+        if (candidate_name != nullptr && strcmp(arg, candidate_name) == 0) { // a null name never matches
+            return candidate;
+        }
+    }
+    return GGML_TYPE_COUNT;
+}
+
+// Parse one "KEY=TYPE:VALUE" override, where TYPE is int, float or bool, and append it to `overrides`.
+// Returns false after reporting on stderr when '=' is missing, the key has 128+ characters, TYPE is unknown, or a bool VALUE is neither true nor false.
+static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+    const char * sep = strchr(data, '=');
+    if (sep == nullptr || sep - data >= 128) {
+        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+        return false;
+    }
+    llama_model_kv_override kvo;
+    std::strncpy(kvo.key, data, sep - data);
+    kvo.key[sep - data] = 0;
+    sep++;
+    if (strncmp(sep, "int:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+        // atoll instead of atol: long is only 32 bits wide on some platforms (e.g. 64-bit Windows)
+        kvo.int_value = std::atoll(sep);
+    } else if (strncmp(sep, "float:", 6) == 0) {
+        sep += 6;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.float_value = std::atof(sep);
+    } else if (strncmp(sep, "bool:", 5) == 0) {
+        sep += 5;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+        kvo.bool_value = std::strcmp(sep, "true") == 0;
+        if (!kvo.bool_value && std::strcmp(sep, "false") != 0) {
+            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            return false;
+        }
+    } else {
+        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+        return false;
+    }
+    overrides.emplace_back(std::move(kvo));
+    return true;
+}
+
 int main(int argc, char ** argv) {
    if (argc < 3) {
        usage(argv[0]);
@@ -199,10 +253,27 @@ int main(int argc, char ** argv) {
    int arg_idx = 1;
    std::string imatrix_file;
    std::vector<std::string> included_weights, excluded_weights;
+    std::vector<llama_model_kv_override> kv_overrides;

    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
        if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
            params.quantize_output_tensor = false;
+        } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
+            if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) {
+                usage(argv[0]);
+            }
        } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
            params.allow_requantize = true;
        } else if (strcmp(argv[arg_idx], "--pure") == 0) {
@@ -243,6 +314,11 @@ int main(int argc, char ** argv) {
    if (!imatrix_data.empty()) {
        params.imatrix = &imatrix_data;
    }
+    if (!kv_overrides.empty()) {
+        kv_overrides.emplace_back();
+        kv_overrides.back().key[0] = 0;
+        params.kv_overrides = &kv_overrides;
+    }

    llama_backend_init();

@@ -264,8 +340,7 @@ int main(int argc, char ** argv) {
        if (ftype_str == "COPY") {
            params.only_copy = true;
        }
-    }
-    else {
+    } else {
        fname_out = argv[arg_idx];
        arg_idx++;

@@ -296,10 +371,12 @@ int main(int argc, char ** argv) {

    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
         params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) {
-        fprintf(stderr, "\n===============================================================================================\n");
-        fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
-        fprintf(stderr, "===============================================================================================\n\n\n");
+         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S  ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
+        fprintf(stderr, "\n==========================================================================================================\n");
+        fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "==========================================================================================================\n\n\n");
        return 1;
    }

--- a/examples/regex-to-grammar.py
+++ b/examples/regex-to-grammar.py
@@ -0,0 +1,20 @@
+import json, subprocess, sys, os
+
+# Usage: regex-to-grammar.py PATTERN [extra arguments forwarded to json-schema-to-grammar.py]
+assert len(sys.argv) >= 2
+[_, pattern, *rest] = sys.argv
+
+# The converter script is expected to live next to this file.
+converter = os.path.join(os.path.dirname(os.path.realpath(__file__)), "json-schema-to-grammar.py")
+
+print(subprocess.check_output(
+    [
+        sys.executable,  # same interpreter as this script; a bare "python" may be missing or a different version
+        converter,
+        *rest,
+        "-",
+        "--raw-pattern",
+    ],
+    text=True,
+    # the pattern is handed to the converter on stdin, wrapped in a string schema
+    input=json.dumps({"type": "string", "pattern": pattern}, indent=2)))
--- a/examples/retrieval/CMakeLists.txt
+++ b/examples/retrieval/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET retrieval) # name of the executable target built from retrieval.cpp
+add_executable(${TARGET} retrieval.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) # links common, llama and the thread library
+target_compile_features(${TARGET} PRIVATE cxx_std_11) # requires at least C++11
--- a/examples/retrieval/README.md
+++ b/examples/retrieval/README.md
@@ -0,0 +1,69 @@
+# llama.cpp/examples/retrieval
+
+Demonstration of simple retrieval technique based on cosine similarity
+
+More info:
+https://github.com/ggerganov/llama.cpp/pull/6193
+
+### How to use
+
+`retrieval.cpp` has parameters of its own:
+- `--context-file`: file to be embedded - state this option multiple times to embed multiple files
+- `--chunk-size`: minimum size of each text chunk to be embedded
+- `--chunk-separator`: STRING to divide chunks by. newline by default
+
+`retrieval` example can be tested as follows:
+
+```bash
+make -j && ./retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .
+```
+
+This chunks and embeds all given files and starts a loop requesting query inputs:
+
+```
+Enter query:
+```
+
+On each query input, top k chunks are shown along with file name, chunk position within file and original text:
+
+```
+Enter query: describe the mit license
+batch_decode: n_tokens = 6, n_seq = 1
+Top 3 similar chunks:
+filename: README.md
+filepos: 119
+similarity: 0.762334
+textdata:
+png)
+
+[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+
+[Roadmap](https://github.
+--------------------
+filename: License
+filepos: 0
+similarity: 0.725146
+textdata:
+MIT License
+
+Copyright (c) 2023 Georgi Gerganov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+--------------------
+filename: README.md
+filepos: 9178
+similarity: 0.621722
+textdata:
+com/cztomsik/ava) (MIT)
+- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
+- [pythops/tenere](https://github.
+--------------------
+```
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@@ -0,0 +1,350 @@
+#include "common.h"
+#include "llama.h"
+
+#include <algorithm>
+#include <fstream>
+
+struct retrieval_params {
+    std::vector<std::string> context_files; // files to chunk and embed (one entry per --context-file)
+    int32_t chunk_size            = 64;     // minimum length of an embedded text chunk (--chunk-size)
+    std::string chunk_separator   = "\n";   // string that ends a chunk (--chunk-separator)
+};
+
+// Print the common gpt_params usage, then the retrieval-specific options with the current chunk-size default.
+static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
+    gpt_print_usage(argc, argv, gpt_params);
+    printf("retrieval options:\n"
+           "  --context-file FNAME  file containing context to embed.\n"
+           "                        specify multiple files by providing --context-file option multiple times.\n"
+           "  --chunk-size N        minimum length of embedded text chunk (default:%d)\n"
+           "  --chunk-separator STRING\n"
+           "                        string to separate chunks (default: \"\\n\")\n" "\n", params.chunk_size);
+}
+
+// Fill gpt_params and retrieval_params from the command line.
+// Invalid, unknown or incomplete options print an error and the usage text, then exit with status 1.
+static void retrieval_params_parse(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & retrieval_params) {
+    // common failure tail: show usage and terminate
+    const auto usage_and_exit = [&]() {
+        retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params);
+        exit(1);
+    };
+    for (int i = 1; i < argc; i++) {
+        const std::string arg = argv[i];
+        bool invalid_gpt_param = false;
+        if (gpt_params_find_arg(argc, argv, argv[i], gpt_params, i, invalid_gpt_param)) {
+            // option was recognized by gpt_params_find_arg
+            if (invalid_gpt_param) {
+                fprintf(stderr, "error: invalid argument: %s\n", arg.c_str());
+                usage_and_exit();
+            }
+            continue;
+        }
+        if (arg == "--context-file") {
+            if (++i >= argc) {
+                fprintf(stderr, "error: missing argument for --context-file\n");
+                usage_and_exit();
+            }
+            // only accept files that can actually be opened
+            if (!std::ifstream(argv[i])) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                usage_and_exit();
+            }
+            // store the external file name in params
+            retrieval_params.context_files.push_back(argv[i]);
+        } else if (arg == "--chunk-size") {
+            if (++i >= argc) {
+                fprintf(stderr, "error: missing argument for --chunk-size\n");
+                usage_and_exit();
+            }
+            retrieval_params.chunk_size = std::stoi(argv[i]);
+        } else if (arg == "--chunk-separator") {
+            if (++i >= argc) {
+                fprintf(stderr, "error: missing argument for --chunk-separator\n");
+                usage_and_exit();
+            }
+            retrieval_params.chunk_separator = argv[i];
+        } else {
+            // unknown argument
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            usage_and_exit();
+        }
+    }
+}
+
+struct chunk {
+    // name of the file this chunk was read from
+    std::string filename;
+    // offset of the chunk: chunk_file sets it to the summed text size of the chunks it saved earlier
+    size_t filepos;
+    // original text data
+    std::string textdata = "";
+    // tokenized text data (assigned in main and cleared there once the embedding is stored)
+    std::vector<llama_token> tokens;
+    // embedding of the chunk, copied in main from the shared output buffer
+    std::vector<float> embedding;
+};
+
+// Split the file into chunks whose text is longer than chunk_size bytes; every chunk ends right
+// after an occurrence of chunk_separator. Data left over at the end of the file (including text
+// after the last separator) is appended to the file's last chunk, or becomes its only chunk.
+static std::vector<chunk> chunk_file(const std::string & filename, int chunk_size, const std::string & chunk_separator) {
+    std::vector<chunk> chunks;
+    std::ifstream f(filename.c_str());
+    if (!f.is_open()) {
+        fprintf(stderr, "Error: could not open file %s\n", filename.c_str());
+        return chunks;
+    }
+    chunk current_chunk;
+    char buffer[1024];
+    int64_t filepos = 0;
+    std::string current = "";
+    // the last read is normally short and sets failbit, so a read that delivered any bytes still counts
+    while (f.read(buffer, sizeof(buffer)) || f.gcount() > 0) {
+        current += std::string(buffer, f.gcount());
+        size_t pos = 0;
+        // an empty separator would match at position 0 forever, so it is never searched for
+        while (!chunk_separator.empty() && (pos = current.find(chunk_separator)) != std::string::npos) {
+            current_chunk.textdata += current.substr(0, pos + chunk_separator.size());
+            if ((int) current_chunk.textdata.size() > chunk_size) {
+                // save chunk
+                current_chunk.filepos = filepos;
+                current_chunk.filename = filename;
+                chunks.push_back(current_chunk);
+                // advance filepos past the saved text and start a fresh chunk
+                filepos += (int) current_chunk.textdata.size();
+                current_chunk = chunk();
+            }
+            current = current.substr(pos + chunk_separator.size());
+        }
+    }
+    // text after the last separator belongs to the leftover data as well
+    current_chunk.textdata += current;
+    // add leftover data to last chunk
+    if (current_chunk.textdata.size() > 0) {
+        if (chunks.empty()) {
+            current_chunk.filepos = filepos;
+            current_chunk.filename = filename;
+            chunks.push_back(current_chunk);
+        } else {
+            chunks.back().textdata += current_chunk.textdata;
+        }
+    }
+    return chunks; // f is closed by its destructor
+}
+
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
+    for (size_t pos = 0, n_tok = tokens.size(); pos < n_tok; ++pos) { // adds every token of the sequence to the batch
+        llama_batch_add(batch, tokens[pos], pos, { seq_id }, pos + 1 == n_tok); // last argument is true only for the final token
+    }
+}
+
+// Decode `batch` and, for each flagged token, pass its embedding through llama_embd_normalize into output + seq_id * n_embd.
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+    // clear previous kv_cache values (irrelevant for embeddings)
+    llama_kv_cache_clear(ctx);
+
+    // run model; a decode failure is reported but the extraction below is still attempted
+    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    if (llama_decode(ctx, batch) < 0) {
+        fprintf(stderr, "%s : failed to decode\n", __func__);
+    }
+
+    for (int tok = 0; tok < batch.n_tokens; tok++) {
+        if (batch.logits[tok]) {
+            const auto seq = batch.seq_id[tok][0];
+
+            // try to get sequence embeddings - supported only when pooling_type is not NONE
+            const float * src = llama_get_embeddings_seq(ctx, seq);
+            if (src == NULL) {
+                src = llama_get_embeddings_ith(ctx, tok);
+            }
+
+            if (src != NULL) {
+                llama_embd_normalize(src, output + seq * n_embd, n_embd);
+            } else {
+                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, tok);
+            }
+        }
+    }
+}
+
+// Chunk and embed the context files, then answer queries read from stdin with the top-k most similar chunks until input ends.
+int main(int argc, char ** argv) {
+    gpt_params params;
+    retrieval_params retrieval_params;
+    retrieval_params_parse(argc, argv, params, retrieval_params);
+
+    // For BERT models, batch size must be equal to ubatch size
+    params.n_ubatch = params.n_batch;
+
+    if (retrieval_params.chunk_size <= 0) {
+        fprintf(stderr, "chunk_size must be positive\n");
+        return 1;
+    }
+    if (retrieval_params.context_files.empty()) {
+        fprintf(stderr, "context_files must be specified\n");
+        return 1;
+    }
+    params.embedding = true;
+
+    print_build_info();
+
+    printf("processing files:\n");
+    for (auto & context_file : retrieval_params.context_files) {
+        printf("%s\n", context_file.c_str());
+    }
+
+    std::vector<chunk> chunks;
+    for (auto & context_file : retrieval_params.context_files) {
+        std::vector<chunk> file_chunk = chunk_file(context_file, retrieval_params.chunk_size, retrieval_params.chunk_separator);
+        chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
+    }
+    printf("Number of chunks: %zu\n", chunks.size()); // %zu is the portable conversion for size_t
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    llama_model * model;
+    llama_context * ctx;
+
+    // load the model
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        return 1;
+    }
+
+    const int n_ctx_train = llama_n_ctx_train(model);
+    const int n_ctx = llama_n_ctx(ctx);
+
+    if (n_ctx > n_ctx_train) {
+        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+                __func__, n_ctx_train, n_ctx);
+    }
+
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s\n", get_system_info(params).c_str());
+    }
+
+    // max batch size
+    const uint64_t n_batch = params.n_batch;
+    GGML_ASSERT(params.n_batch >= params.n_ctx);
+
+    // tokenize the prompts and trim
+    for (auto & chunk : chunks) {
+        auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
+        if (inp.size() > n_batch) {
+            fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+                    __func__, (long long int) inp.size(), (long long int) n_batch);
+            return 1;
+        }
+        // add eos if not present
+        if (inp.empty() || inp.back() != llama_token_eos(model)) {
+            inp.push_back(llama_token_eos(model));
+        }
+        chunk.tokens = inp;
+    }
+
+    // tokenization stats
+    if (params.verbose_prompt) {
+        for (int i = 0; i < (int) chunks.size(); i++) {
+            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
+            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
+            for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
+                fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+            }
+            fprintf(stderr, "\n\n");
+        }
+    }
+
+    // initialize batch
+    const int n_chunks = chunks.size();
+    struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
+    // allocate output
+    const int n_embd = llama_n_embd(model);
+    std::vector<float> embeddings(n_chunks * n_embd, 0);
+    float * emb = embeddings.data();
+
+    // break into batches
+    int p = 0; // number of prompts processed already
+    int s = 0; // number of prompts in current batch
+    for (int k = 0; k < n_chunks; k++) {
+        // tokens of chunk k; nothing is clamped here, oversized chunks were rejected during tokenization
+        auto & inp = chunks[k].tokens;
+
+        const uint64_t n_toks = inp.size();
+
+        // encode if at capacity
+        if (batch.n_tokens + n_toks > n_batch) {
+            float * out = emb + p * n_embd;
+            batch_decode(ctx, batch, out, s, n_embd);
+            llama_batch_clear(batch);
+            p += s;
+            s = 0;
+        }
+
+        // add to batch
+        batch_add_seq(batch, inp, s);
+        s += 1;
+    }
+
+    // final batch
+    float * out = emb + p * n_embd;
+    batch_decode(ctx, batch, out, s, n_embd);
+    llama_batch_free(batch); // the context batch is not used after the last decode
+    // save embeddings to chunks
+    for (int i = 0; i < n_chunks; i++) {
+        chunks[i].embedding = std::vector<float>(emb + i * n_embd, emb + (i + 1) * n_embd);
+        // clear tokens as they are no longer needed
+        chunks[i].tokens.clear();
+    }
+
+    // start loop, receive query and return top k similar chunks based on cosine similarity
+    std::string query;
+    while (true) {
+        printf("Enter query: ");
+        if (!std::getline(std::cin, query)) { break; } // EOF or input error: stop instead of looping forever
+        std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
+
+        struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
+        batch_add_seq(query_batch, query_tokens, 0);
+
+        std::vector<float> query_emb(n_embd, 0);
+        batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
+
+        llama_batch_free(query_batch); // a batch is allocated per query, so release it per query
+
+        // compute cosine similarities
+        {
+            std::vector<std::pair<int, float>> similarities;
+            for (int i = 0; i < n_chunks; i++) {
+                float sim = llama_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
+                similarities.push_back(std::make_pair(i, sim));
+            }
+
+            // sort similarities
+            std::sort(similarities.begin(), similarities.end(), [](const std::pair<int, float> & a, const std::pair<int, float> & b) {
+                return a.second > b.second;
+            });
+
+            printf("Top %d similar chunks:\n", params.sparams.top_k);
+            for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
+                printf("filename: %s\n", chunks[similarities[i].first].filename.c_str());
+                printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
+                printf("similarity: %f\n", similarities[i].second);
+                printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str());
+                printf("--------------------\n");
+            }
+        }
+    }
+
+    // clean up (reached once the query input ends)
+    llama_print_timings(ctx);
+    llama_free(ctx);
+    llama_free_model(model);
+    llama_backend_free();
+}
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -2,12 +2,16 @@ set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
+add_executable(${TARGET}
+    server.cpp
+    utils.hpp
+    httplib.h
+)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common json-schema-to-grammar ${CMAKE_THREAD_LIBS_INIT})
 if (LLAMA_SERVER_SSL)
    find_package(OpenSSL REQUIRED)
    target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -16,16 +16,20 @@ The project is under active development, and we are [looking for feedback and co

 **Command line options:**

- `--threads N`, `-t N`: Set the number of threads to use during generation.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
+- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server uses batching, so this parameter is used only if one token is to be processed on the CPU backend.
+- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
 - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
+- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (default: unused).
+- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository (default: unused).
+- `-hff FILE, --hf-file FILE`: Hugging Face model file (default: unused).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
+- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used.
+- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
+- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048`.
+- `-ub N`, `--ubatch-size N`: physical maximum batch size. Default: `512`.
 - `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
 - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
@@ -56,7 +60,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included.
 - `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled)
 - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, default: enabled.
+- `--log-disable`: Output logs to stdout only, not to `llama.log`. default: enabled.
 - `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json)

 **If compiled with `LLAMA_SERVER_SSL=ON`**
@@ -259,7 +263,7 @@ node index.js

    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

-    `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
+    `id_slot`: Assign the completion task to a specific slot. If it is -1, the task will be assigned to an idle slot (default: -1)

    `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false)

@@ -356,7 +360,7 @@ Notice that each `probs` is an array of length `n_probs`.
 - `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint.
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)

- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint.
+- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted JSON description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.

    *Options:*

--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@@ -26,8 +26,9 @@ const propOrder = grammarJsonSchemaPropOrder

 let grammar = null
 if (grammarJsonSchemaFile) {
-    const schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
-    const converter = new SchemaConverter(propOrder)
+    let schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
+    const converter = new SchemaConverter({prop_order: propOrder, allow_fetch: true})
+    schema = await converter.resolveRefs(schema, grammarJsonSchemaFile)
    converter.visit(schema, '')
    grammar = converter.formatGrammar()
 }
--- a/examples/server/completion.js.hpp
+++ b/examples/server/completion.js.hpp
@@ -483,4 +483,4 @@ unsigned char completion_js[] = {
  0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
 };
-unsigned int completion_js_len = 5796;
+size_t completion_js_len = 5796;
--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/index.js.hpp
+++ b/examples/server/index.js.hpp
--- a/examples/server/json-schema-to-grammar.mjs.hpp
+++ b/examples/server/json-schema-to-grammar.mjs.hpp
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -630,14 +630,16 @@

      const grammarJsonSchemaPropOrder = signal('')
      const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
-      const convertJSONSchemaGrammar = () => {
+      const convertJSONSchemaGrammar = async () => {
        try {
-          const schema = JSON.parse(params.value.grammar)
-          const converter = new SchemaConverter(
-            grammarJsonSchemaPropOrder.value
+          let schema = JSON.parse(params.value.grammar)
+          const converter = new SchemaConverter({
+            prop_order: grammarJsonSchemaPropOrder.value
              .split(',')
-              .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {})
-          )
+              .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {}),
+            allow_fetch: true,
+          })
+          schema = await converter.resolveRefs(schema, 'input')
          converter.visit(schema, '')
          params.value = {
            ...params.value,
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -1,112 +1,538 @@
+// WARNING: This file was ported from json-schema-to-grammar.py, please fix bugs / add features there first.
 const SPACE_RULE = '" "?';

 const PRIMITIVE_RULES = {
  boolean: '("true" | "false") space',
  number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
  integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
+  value: 'object | array | string | number | boolean',
+  object: '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space',
+  array: '"[" space ( value ("," space value)* )? "]" space',
+  uuid: '"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space',
  string: ` "\\"" (
        [^"\\\\] |
        "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
      )* "\\"" space`,
  null: '"null" space',
 };
+const OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value'];
+
+// TODO: support "uri", "email" string formats
+const DATE_RULES = {
+    'date'   : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )',
+    'time'   : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )',
+    'date-time': 'date "T" time',
+    'date-string': '"\\"" date "\\"" space',
+    'time-string': '"\\"" time "\\"" space',
+    'date-time-string': '"\\"" date-time "\\"" space',
+};
+
+const RESERVED_NAMES = {'root': true, ...PRIMITIVE_RULES, ...DATE_RULES};

 const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
 const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
-const GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'};
+const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
+const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };
+
+const NON_LITERAL_SET = new Set('|.()[]{}*+?');
+const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');

 export class SchemaConverter {
-  constructor(propOrder) {
-    this._propOrder = propOrder || {};
-    this._rules = new Map();
-    this._rules.set('space', SPACE_RULE);
+  constructor(options) {
+    this._propOrder = options.prop_order || {};
+    this._allowFetch = options.allow_fetch || false;
+    this._dotall = options.dotall || false;
+    this._rules = {'space': SPACE_RULE};
+    this._refs = {};
+    this._refsBeingResolved = new Set();
  }

  _formatLiteral(literal) {
-    const escaped = JSON.stringify(literal).replace(
+    const escaped = literal.replace(
      GRAMMAR_LITERAL_ESCAPE_RE,
      m => GRAMMAR_LITERAL_ESCAPES[m]
    );
    return `"${escaped}"`;
  }

+  _formatRangeChar(literal) {
+    return JSON.stringify(literal).slice(1, -1).replace(
+      GRAMMAR_RANGE_LITERAL_ESCAPE_RE,
+      m => GRAMMAR_LITERAL_ESCAPES[m]
+    );
+  }
+
  _addRule(name, rule) {
    let escName = name.replace(INVALID_RULE_CHARS_RE, '-');
    let key = escName;

-    if (this._rules.has(escName)) {
-      if (this._rules.get(escName) === rule) {
+    if (escName in this._rules) {
+      if (this._rules[escName] === rule) {
        return key;
      }

      let i = 0;
-      while (this._rules.has(`${escName}${i}`)) {
+      while ((`${escName}${i}` in this._rules) && (this._rules[`${escName}${i}`] !== rule)) {
        i += 1;
      }
      key = `${escName}${i}`;
    }

-    this._rules.set(key, rule);
+    this._rules[key] = rule;
    return key;
  }

+  async resolveRefs(schema, url) {
+    const visit = async (n) => {
+      if (Array.isArray(n)) {
+        return Promise.all(n.map(visit));
+      } else if (typeof n === 'object' && n !== null) {
+        let ref = n.$ref;
+        let target;
+        if (ref !== undefined && !this._refs[ref]) {
+          if (ref.startsWith('https://')) {
+            if (!this._allowFetch) {
+              throw new Error('Fetching remote schemas is not allowed (use --allow-fetch for force)');
+            }
+            const fetch = (await import('node-fetch')).default;
+
+            const fragSplit = ref.split('#');
+            const baseUrl = fragSplit[0];
+
+            target = this._refs[baseUrl];
+            if (!target) {
+              target = await this.resolveRefs(await fetch(ref).then(res => res.json()), baseUrl);
+              this._refs[baseUrl] = target;
+            }
+
+            if (fragSplit.length === 1 || fragSplit[fragSplit.length - 1] === '') {
+              return target;
+            }
+          } else if (ref.startsWith('#/')) {
+            target = schema;
+            ref = `${url}${ref}`;
+            n.$ref = ref;
+          } else {
+            throw new Error(`Unsupported ref ${ref}`);
+          }
+
+          const selectors = ref.split('#')[1].split('/').slice(1);
+          for (const sel of selectors) {
+            if (!target || !(sel in target)) {
+              throw new Error(`Error resolving ref ${ref}: ${sel} not in ${JSON.stringify(target)}`);
+            }
+            target = target[sel];
+          }
+
+          this._refs[ref] = target;
+        } else {
+          await Promise.all(Object.values(n).map(visit));
+        }
+      }
+
+      return n;
+    };
+
+    return visit(schema);
+  }
+
+  _generateUnionRule(name, altSchemas) {
+    return altSchemas
+      .map((altSchema, i) => this.visit(altSchema, `${name ?? ''}${name ? '-' : 'alternative-'}${i}`))
+      .join(' | ');
+  }
+
+  _visitPattern(pattern, name) {
+    if (!pattern.startsWith('^') || !pattern.endsWith('$')) {
+      throw new Error('Pattern must start with "^" and end with "$"');
+    }
+    pattern = pattern.slice(1, -1);
+    const subRuleIds = {};
+
+    let i = 0;
+    const length = pattern.length;
+
+    const getDot = () => {
+      let rule;
+      if (this._dotall) {
+        rule = '[\\U00000000-\\U0010FFFF]';
+      } else {
+        // Accept any character... except \n and \r line break chars (\x0A and \x0D)
+        rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]';
+      }
+      return this._addRule('dot', rule);
+    };
+
+
+    const toRule = ([s, isLiteral]) => isLiteral ? "\"" + s + "\"" : s;
+
+    const transform = () => {
+      const start = i;
+      // For each component of this sequence, store its string representation and whether it's a literal.
+      // We only need a flat structure here to apply repetition operators to the last item, and
+      // to merge literals at the end (we're parsing grouped ( sequences ) recursively and don't treat '|' specially;
+      // GBNF's syntax is luckily very close to regular expressions!)
+      const seq = [];
+
+      const joinSeq = () => {
+        const ret = [];
+        for (const [isLiteral, g] of groupBy(seq, x => x[1])) {
+          if (isLiteral) {
+            ret.push([[...g].map(x => x[0]).join(''), true]);
+          } else {
+            ret.push(...g);
+          }
+        }
+        if (ret.length === 1) {
+          return ret[0];
+        }
+        return [ret.map(x => toRule(x)).join(' '), false];
+      };
+
+      while (i < length) {
+        const c = pattern[i];
+        if (c === '.') {
+          seq.push([getDot(), false]);
+          i += 1;
+        } else if (c === '(') {
+          i += 1;
+          if (i < length) {
+            if (pattern[i] === '?') {
+              throw new Error(`Unsupported pattern syntax "${pattern[i]}" at index ${i} of /${pattern}/`);
+            }
+          }
+          seq.push([`(${toRule(transform())})`, false]);
+        } else if (c === ')') {
+          i += 1;
+          if (start <= 0 || pattern[start - 1] !== '(') {
+            throw new Error(`Unbalanced parentheses; start = ${start}, i = ${i}, pattern = ${pattern}`);
+          }
+          return joinSeq();
+        } else if (c === '[') {
+          let squareBrackets = c;
+          i += 1;
+          while (i < length && pattern[i] !== ']') {
+            if (pattern[i] === '\\') {
+              squareBrackets += pattern.slice(i, i + 2);
+              i += 2;
+            } else {
+              squareBrackets += pattern[i];
+              i += 1;
+            }
+          }
+          if (i >= length) {
+            throw new Error(`Unbalanced square brackets; start = ${start}, i = ${i}, pattern = ${pattern}`);
+          }
+          squareBrackets += ']';
+          i += 1;
+          seq.push([squareBrackets, false]);
+        } else if (c === '|') {
+          seq.push(['|', false]);
+          i += 1;
+        } else if (c === '*' || c === '+' || c === '?') {
+          seq[seq.length - 1] = [toRule(seq[seq.length - 1]) + c, false];
+          i += 1;
+        } else if (c === '{') {
+          let curlyBrackets = c;
+          i += 1;
+          while (i < length && pattern[i] !== '}') {
+            curlyBrackets += pattern[i];
+            i += 1;
+          }
+          if (i >= length) {
+            throw new Error(`Unbalanced curly brackets; start = ${start}, i = ${i}, pattern = ${pattern}`);
+          }
+          curlyBrackets += '}';
+          i += 1;
+          const nums = curlyBrackets.slice(1, -1).split(',').map(s => s.trim());
+          let minTimes, maxTimes;
+          if (nums.length === 1) {
+            minTimes = parseInt(nums[0], 10);
+            maxTimes = minTimes;
+          } else {
+            if (nums.length !== 2) {
+              throw new Error(`Invalid quantifier ${curlyBrackets}`);
+            }
+            minTimes = nums[0] ? parseInt(nums[0], 10) : 0;
+            maxTimes = nums[1] ? parseInt(nums[1], 10) : Infinity;
+          }
+
+          let [sub, subIsLiteral] = seq[seq.length - 1];
+
+          if (minTimes === 0 && maxTimes === Infinity) {
+            seq[seq.length - 1] = [`${sub}*`, false];
+          } else if (minTimes === 0 && maxTimes === 1) {
+            seq[seq.length - 1] = [`${sub}?`, false];
+          } else if (minTimes === 1 && maxTimes === Infinity) {
+            seq[seq.length - 1] = [`${sub}+`, false];
+          } else {
+            if (!subIsLiteral) {
+              let id = subRuleIds[sub];
+              if (id === undefined) {
+                id = this._addRule(`${name}-${Object.keys(subRuleIds).length + 1}`, sub);
+                subRuleIds[sub] = id;
+              }
+              sub = id;
+            }
+
+            const repeatedSub = Array.from({ length: minTimes }, () => subIsLiteral ? `"${sub.slice(1, -1).repeat(minTimes)}"` : sub);
+            const optionalSub = maxTimes !== undefined ? Array.from({ length: maxTimes - minTimes }, () => `${sub}?`) : [`${sub}*`];
+            seq[seq.length - 1] = [repeatedSub.concat(optionalSub).join(' '), false];
+          }
+        } else {
+          let literal = '';
+          while (i < length) {
+            if (pattern[i] === '\\' && i < length - 1) {
+              const next = pattern[i + 1];
+              if (ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS.has(next)) {
+                i += 1;
+                literal += pattern[i];
+                i += 1;
+              } else {
+                literal += pattern.slice(i, i + 2);
+                i += 2;
+              }
+            } else if (pattern[i] === '"') {
+              literal += '\\"';
+              i += 1;
+            } else if (!NON_LITERAL_SET.has(pattern[i]) &&
+                (i === length - 1 || literal === '' || pattern[i + 1] === '.' || !NON_LITERAL_SET.has(pattern[i+1]))) {
+              literal += pattern[i];
+              i += 1;
+            } else {
+              break;
+            }
+          }
+          if (literal !== '') {
+            seq.push([literal, true]);
+          }
+        }
+      }
+
+      return joinSeq();
+    };
+
+    return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
+  }
+
+  _resolveRef(ref) {
+    let refName = ref.split('/').pop();
+    if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) {
+      this._refsBeingResolved.add(ref);
+      const resolved = this._refs[ref];
+      refName = this.visit(resolved, refName);
+      this._refsBeingResolved.delete(ref);
+    }
+    return refName;
+  }
+
+  _generateConstantRule(value) {
+    return this._formatLiteral(JSON.stringify(value));
+  }
+
  visit(schema, name) {
    const schemaType = schema.type;
-    const ruleName = name || 'root';
+    const schemaFormat = schema.format;
+    const ruleName = name in RESERVED_NAMES ? name + '-' : name == '' ? 'root' : name;

-    if (schema.oneOf || schema.anyOf) {
-      const rule = (schema.oneOf || schema.anyOf).map((altSchema, i) =>
-        this.visit(altSchema, `${name}${name ? "-" : ""}${i}`)
-      ).join(' | ');
-
-      return this._addRule(ruleName, rule);
+    const ref = schema.$ref;
+    if (ref !== undefined) {
+      return this._addRule(ruleName, this._resolveRef(ref));
+    } else if (schema.oneOf || schema.anyOf) {
+      return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf));
+    } else if (Array.isArray(schemaType)) {
+      return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
    } else if ('const' in schema) {
-      return this._addRule(ruleName, this._formatLiteral(schema.const));
+      return this._addRule(ruleName, this._generateConstantRule(schema.const));
    } else if ('enum' in schema) {
-      const rule = schema.enum.map(v => this._formatLiteral(v)).join(' | ');
+      const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | ');
      return this._addRule(ruleName, rule);
-    } else if (schemaType === 'object' && 'properties' in schema) {
-      // TODO: `required` keyword (from python implementation)
-      const propOrder = this._propOrder;
-      const propPairs = Object.entries(schema.properties).sort((a, b) => {
-        // sort by position in prop_order (if specified) then by key
-        const orderA = typeof propOrder[a[0]] === 'number' ? propOrder[a[0]] : Infinity;
-        const orderB = typeof propOrder[b[0]] === 'number' ? propOrder[b[0]] : Infinity;
-        return orderA - orderB || a[0].localeCompare(b[0]);
-      });
-
-      let rule = '"{" space';
-      propPairs.forEach(([propName, propSchema], i) => {
-        const propRuleName = this.visit(propSchema, `${name}${name ? "-" : ""}${propName}`);
-        if (i > 0) {
-          rule += ' "," space';
+    } else if ((schemaType === undefined || schemaType === 'object') &&
+               ('properties' in schema ||
+                ('additionalProperties' in schema && schema.additionalProperties !== true))) {
+      const required = new Set(schema.required || []);
+      const properties = Object.entries(schema.properties ?? {});
+      return this._addRule(ruleName, this._buildObjectRule(properties, required, name, schema.additionalProperties));
+    } else if ((schemaType === undefined || schemaType === 'object') && 'allOf' in schema) {
+      const required = new Set();
+      const properties = [];
+      const addComponent = (compSchema, isRequired) => {
+        const ref = compSchema.$ref;
+        if (ref !== undefined) {
+          compSchema = this._refs[ref];
        }
-        rule += ` ${this._formatLiteral(propName)} space ":" space ${propRuleName}`;
-      });
-      rule += ' "}" space';

-      return this._addRule(ruleName, rule);
-    } else if (schemaType === 'array' && 'items' in schema) {
-      // TODO `prefixItems` keyword (from python implementation)
-      const itemRuleName = this.visit(schema.items, `${name}${name ? "-" : ""}item`);
-      const rule = `"[" space (${itemRuleName} ("," space ${itemRuleName})*)? "]" space`;
-      return this._addRule(ruleName, rule);
+        if ('properties' in compSchema) {
+          for (const [propName, propSchema] of Object.entries(compSchema.properties)) {
+            properties.push([propName, propSchema]);
+            if (isRequired) {
+              required.add(propName);
+            }
+          }
+        }
+      };
+
+      for (const t of schema.allOf) {
+        if ('anyOf' in t) {
+          for (const tt of t.anyOf) {
+            addComponent(tt, false);
+          }
+        } else {
+          addComponent(t, true);
+        }
+      }
+
+      return this._addRule(ruleName, this._buildObjectRule(properties, required, name, /* additionalProperties= */ false));
+    } else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) {
+      const items = schema.items ?? schema.prefixItems;
+      if (Array.isArray(items)) {
+        return this._addRule(
+          ruleName,
+          '"[" space ' +
+            items.map((item, i) => this.visit(item, `${name ?? ''}${name ? '-' : ''}tuple-${i}`)).join(' "," space ') +
+            ' "]" space'
+        );
+      } else {
+        const itemRuleName = this.visit(items, `${name ?? ''}${name ? '-' : ''}item`);
+        const listItemOperator = `( "," space ${itemRuleName} )`;
+        let successiveItems = '';
+        let minItems = schema.minItems || 0;
+        const maxItems = schema.maxItems;
+        if (minItems > 0) {
+          successiveItems = listItemOperator.repeat(minItems - 1);
+          minItems--;
+        }
+        if (maxItems !== undefined && maxItems > minItems) {
+          successiveItems += `${listItemOperator}?`.repeat(maxItems - minItems - 1);
+        } else {
+          successiveItems += `${listItemOperator}*`;
+        }
+        const rule = minItems === 0
+          ? `"[" space ( ${itemRuleName} ${successiveItems} )? "]" space`
+          : `"[" space ${itemRuleName} ${successiveItems} "]" space`;
+        return this._addRule(ruleName, rule);
+      }
+    } else if ((schemaType === undefined || schemaType === 'string') && 'pattern' in schema) {
+      return this._visitPattern(schema.pattern, ruleName);
+    } else if ((schemaType === undefined || schemaType === 'string') && /^uuid[1-5]?$/.test(schema.format || '')) {
+      return this._addRule(
+          ruleName === 'root' ? 'root' : schemaFormat,
+          PRIMITIVE_RULES['uuid'])
+    } else if ((schemaType === undefined || schemaType === 'string') && schema.format in DATE_RULES) {
+      for (const [t, r] of Object.entries(DATE_RULES)) {
+        this._addRule(t, r);
+      }
+      return schemaFormat + '-string';
+    } else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
+      for (const n of OBJECT_RULE_NAMES) {
+        this._addRule(n, PRIMITIVE_RULES[n]);
+      }
+      return this._addRule(ruleName, 'object');
    } else {
-      if (!PRIMITIVE_RULES[schemaType]) {
+      if (!(schemaType in PRIMITIVE_RULES)) {
        throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
      }
-      return this._addRule(
-        ruleName === 'root' ? 'root' : schemaType,
-        PRIMITIVE_RULES[schemaType]
+      // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero
+      return this._addRule(ruleName === 'root' ? 'root' : schemaType, PRIMITIVE_RULES[schemaType]);
+    }
+  }
+
+  _buildObjectRule(properties, required, name, additionalProperties) {
+    const propOrder = this._propOrder;
+    // sort by position in prop_order (if specified) then by original order
+    const sortedProps = properties.map(([k]) => k).sort((a, b) => {
+      const orderA = propOrder[a] || Infinity;
+      const orderB = propOrder[b] || Infinity;
+      return orderA - orderB || properties.findIndex(([k]) => k === a) - properties.findIndex(([k]) => k === b);
+    });
+
+    const propKvRuleNames = {};
+    for (const [propName, propSchema] of properties) {
+      const propRuleName = this.visit(propSchema, `${name ?? ''}${name ? '-' : ''}${propName}`);
+      propKvRuleNames[propName] = this._addRule(
+        `${name ?? ''}${name ? '-' : ''}${propName}-kv`,
+        `${this._formatLiteral(JSON.stringify(propName))} space ":" space ${propRuleName}`
      );
    }
+    const requiredProps = sortedProps.filter(k => required.has(k));
+    const optionalProps = sortedProps.filter(k => !required.has(k));
+
+    if (typeof additionalProperties === 'object' || additionalProperties === true) {
+      const subName = `${name ?? ''}${name ? '-' : ''}additional`;
+      const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
+      propKvRuleNames['*'] = this._addRule(
+        `${subName}-kv`,
+        `${this._addRule('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
+      optionalProps.push('*');
+    }
+
+    let rule = '"{" space ';
+    rule += requiredProps.map(k => propKvRuleNames[k]).join(' "," space ');
+
+    if (optionalProps.length > 0) {
+      rule += ' (';
+      if (requiredProps.length > 0) {
+        rule += ' "," space ( ';
+      }
+
+      const getRecursiveRefs = (ks, firstIsOptional) => {
+        const [k, ...rest] = ks;
+        const kvRuleName = propKvRuleNames[k];
+        let res;
+        if (k === '*') {
+            res = this._addRule(
+                `${name ?? ''}${name ? '-' : ''}additional-kvs`,
+                `${kvRuleName} ( "," space ` + kvRuleName + ` )*`
+            )
+        } else if (firstIsOptional) {
+          res = `( "," space ${kvRuleName} )?`;
+        } else {
+          res = kvRuleName;
+        }
+        if (rest.length > 0) {
+          res += ' ' + this._addRule(
+            `${name ?? ''}${name ? '-' : ''}${k}-rest`,
+            getRecursiveRefs(rest, true)
+          );
+        }
+        return res;
+      };
+
+      rule += optionalProps.map((_, i) => getRecursiveRefs(optionalProps.slice(i), false)).join(' | ');
+      if (requiredProps.length > 0) {
+        rule += ' )';
+      }
+      rule += ' )?';
+    }
+
+    rule += ' "}" space';
+
+    return rule;
  }

  formatGrammar() {
    let grammar = '';
-    this._rules.forEach((rule, name) => {
+    for (const [name, rule] of Object.entries(this._rules).sort(([a], [b]) => a.localeCompare(b))) {
      grammar += `${name} ::= ${rule}\n`;
-    });
+    }
    return grammar;
  }
 }
+
+// Helper generator that groups runs of consecutive elements sharing the same key; yields [key, group] pairs
+function* groupBy(iterable, keyFn) {
+  let lastKey = null;
+  let group = [];
+  for (const element of iterable) {
+    const key = keyFn(element);
+    if (lastKey !== null && key !== lastKey) {
+      yield [lastKey, group];
+      group = [];
+    }
+    group.push(element);
+    lastKey = key;
+  }
+  if (group.length > 0) {
+    yield [lastKey, group];
+  }
+}
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,6 +1,7 @@
 #include "utils.hpp"

 #include "common.h"
+#include "json-schema-to-grammar.h"
 #include "llama.h"
 #include "grammar-parser.h"

@@ -29,7 +30,7 @@
 #include <signal.h>
 #include <memory>

-using json = nlohmann::json;
+using json = nlohmann::ordered_json;

 bool server_verbose = false;
 bool server_log_json = true;
@@ -98,6 +99,7 @@ struct slot_params {

    uint32_t seed      = -1; // RNG seed
    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
+    int32_t  n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
    int32_t  n_predict = -1; // new tokens to predict

    std::vector<std::string> antiprompt;
@@ -178,6 +180,7 @@ struct server_slot {
    llama_token sampled;
    struct llama_sampling_params sparams;
    llama_sampling_context * ctx_sampling = nullptr;
+    json json_schema;

    int32_t ga_i = 0;   // group-attention state
    int32_t ga_n = 1;   // group-attention factor
@@ -744,7 +747,8 @@ struct server_context {
        {
            const int32_t n_batch = llama_n_batch(ctx);

-            batch = llama_batch_init(n_batch, 0, params.n_parallel);
+            // only a single seq_id per token is needed
+            batch = llama_batch_init(n_batch, 0, 1);
        }

        metrics.init();
@@ -844,11 +848,27 @@ struct server_context {
        slot.sparams.mirostat_eta      = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
        slot.sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
        slot.params.n_keep             = json_value(data, "n_keep",            slot.params.n_keep);
+        slot.params.n_discard          = json_value(data, "n_discard",         default_params.n_discard);
        slot.params.seed               = json_value(data, "seed",              default_params.seed);
-        slot.sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
        slot.sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
        slot.sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);

+        // process "json_schema" and "grammar"
+        if (data.contains("json_schema") && data.contains("grammar")) {
+            send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
+            return false;
+        } else if (data.contains("json_schema") && !data.contains("grammar")) {
+            try {
+                auto schema                = json_value(data, "json_schema", json::object());
+                slot.sparams.grammar       = json_schema_to_grammar(schema);
+            } catch (const std::exception & e) {
+                send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
+                return false;
+            }
+        } else {
+            slot.sparams.grammar       = json_value(data, "grammar",           default_sparams.grammar);
+        }
+
        if (slot.params.cache_prompt && slot.ga_n != 1) {
            LOG_WARNING("cache_prompt is not supported with group-attention", {});
            slot.params.cache_prompt = false;
@@ -1235,7 +1255,8 @@ struct server_context {
            {"penalize_nl",               slot.sparams.penalize_nl},
            {"stop",                      slot.params.antiprompt},
            {"n_predict",                 slot.params.n_predict}, // TODO: fix duplicate key n_predict
-            {"n_keep",                    params.n_keep},
+            {"n_keep",                    slot.params.n_keep},
+            {"n_discard",                 slot.params.n_discard},
            {"ignore_eos",                ignore_eos},
            {"stream",                    slot.params.stream},
            {"logit_bias",                slot.sparams.logit_bias},
@@ -1679,7 +1700,7 @@ struct server_context {
                    // Shift context
                    const int n_keep    = slot.params.n_keep + add_bos_token;
                    const int n_left    = (int) system_tokens.size() + slot.n_past - n_keep;
-                    const int n_discard = n_left / 2;
+                    const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);

                    LOG_INFO("slot context shift", {
                        {"id_slot",         slot.id},
@@ -1746,7 +1767,7 @@ struct server_context {
        }

        // process in chunks of params.n_batch
-        int32_t n_batch = llama_n_batch(ctx);
+        int32_t n_batch  = llama_n_batch(ctx);
        int32_t n_ubatch = llama_n_ubatch(ctx);

        // next, batch any pending prompts without exceeding n_batch
@@ -2195,6 +2216,12 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
    }
    printf("  -m FNAME, --model FNAME\n");
    printf("                            model path (default: %s)\n", params.model.c_str());
+    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
+    printf("                            model download url (default: unused)\n");
+    printf("  -hfr REPO, --hf-repo REPO\n");
+    printf("                            Hugging Face model repository (default: unused)\n");
+    printf("  -hff FILE, --hf-file FILE\n");
+    printf("                            Hugging Face model file (default: unused)\n");
    printf("  -a ALIAS, --alias ALIAS\n");
    printf("                            set an alias for the model, will be added as `model` field in completion response\n");
    printf("  --lora FNAME              apply LoRA adapter (implies --no-mmap)\n");
@@ -2211,7 +2238,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
    printf("  -to N, --timeout N        server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
    printf("  --embeddings              enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
    printf("  -np N, --parallel N       number of slots for process requests (default: %d)\n", params.n_parallel);
-    printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+    printf("  -cb, --cont-batching      enable continuous batching (a.k.a dynamic batching) (default: enabled)\n");
    printf("  -spf FNAME, --system-prompt-file FNAME\n");
    printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
    printf("  -ctk TYPE, --cache-type-k TYPE\n");
@@ -2317,6 +2344,24 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                break;
            }
            params.model = argv[i];
+        } else if (arg == "-mu" || arg == "--model-url") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model_url = argv[i];
+        } else if (arg == "-hfr" || arg == "--hf-repo") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.hf_repo = argv[i];
+        } else if (arg == "-hff" || arg == "--hf-file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.hf_file = argv[i];
        } else if (arg == "-a" || arg == "--alias") {
            if (++i >= argc) {
                invalid_param = true;
@@ -2469,15 +2514,15 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                invalid_param = true;
                break;
            }
-#ifndef GGML_USE_CUBLAS
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CUDA
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA
        } else if (arg == "--tensor-split" || arg == "-ts") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
            std::string arg_next = argv[i];

            // split string by , and /
@@ -2494,17 +2539,17 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                }
            }
 #else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
-#endif // GGML_USE_CUBLAS
+            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {});
+#endif // GGML_USE_CUDA
        } else if (arg == "--main-gpu" || arg == "-mg") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)
            params.main_gpu = std::stoi(argv[i]);
 #else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
+            LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a main GPU.", {});
 #endif
        } else if (arg == "--lora") {
            if (++i >= argc) {
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -57,7 +57,7 @@ Feature or Scenario must be annotated with `@llama.cpp` to be included in the de
 To run a scenario annotated with `@bug`, start:

 ```shell
-DEBUG=ON ./tests.sh --no-skipped --tags bug
+DEBUG=ON ./tests.sh --no-skipped --tags bug --stop
 ```

 After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.
--- a/examples/server/tests/features/embeddings.feature
+++ b/examples/server/tests/features/embeddings.feature
@@ -4,7 +4,8 @@ Feature: llama.cpp server

  Background: Server startup
    Given a server listening on localhost:8080
-    And   a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
+    And   a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
+    And   a model file ggml-model-f16.gguf
    And   a model alias bert-bge-small
    And   42 as server seed
    And   2 slots
--- a/examples/server/tests/features/environment.py
+++ b/examples/server/tests/features/environment.py
@@ -1,17 +1,18 @@
-import errno
 import os
-import socket
-import subprocess
-import time
-from contextlib import closing
 import signal
+import socket
+import sys
+import time
+import traceback
+from contextlib import closing
+from subprocess import TimeoutExpired


 def before_scenario(context, scenario):
    context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON'
    if context.debug:
-        print("DEBUG=ON\n")
-    print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m\n")
+        print("DEBUG=ON")
+    print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m")
    port = 8080
    if 'PORT' in os.environ:
        port = int(os.environ['PORT'])
@@ -20,58 +21,45 @@ def before_scenario(context, scenario):


 def after_scenario(context, scenario):
-    if context.server_process is None:
-        return
-    if scenario.status == "failed":
-        if 'GITHUB_ACTIONS' in os.environ:
-            print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
-            if os.path.isfile('llama.log'):
-                with closing(open('llama.log', 'r')) as f:
-                    for line in f:
-                        print(line)
-        if not is_server_listening(context.server_fqdn, context.server_port):
-            print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
+    try:
+        if 'server_process' not in context or context.server_process is None:
+            return
+        if scenario.status == "failed":
+            if 'GITHUB_ACTIONS' in os.environ:
+                print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n")
+                if os.path.isfile('llama.log'):
+                    with closing(open('llama.log', 'r')) as f:
+                        for line in f:
+                            print(line)
+            if not is_server_listening(context.server_fqdn, context.server_port):
+                print("\x1b[33;101mERROR: Server stopped listening\x1b[0m")

-    if not pid_exists(context.server_process.pid):
-        assert False, f"Server not running pid={context.server_process.pid} ..."
+        if context.server_process.poll() is not None:
+            assert False, f"Server not running pid={context.server_process.pid} ..."

-    server_graceful_shutdown(context)
+        server_graceful_shutdown(context)  # SIGINT

-    # Wait few for socket to free up
-    time.sleep(0.05)
+        try:
+            context.server_process.wait(0.5)
+        except TimeoutExpired:
+            print(f"server still alive after 500ms, force-killing pid={context.server_process.pid} ...")
+            context.server_process.kill()  # SIGKILL
+            context.server_process.wait()

-    attempts = 0
-    while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
-        server_kill(context)
-        time.sleep(0.1)
-        attempts += 1
-        if attempts > 5:
-            server_kill_hard(context)
+        while is_server_listening(context.server_fqdn, context.server_port):
+            time.sleep(0.1)
+    except Exception:
+        print("ignoring error in after_scenario:")
+        traceback.print_exc(file=sys.stdout)


 def server_graceful_shutdown(context):
-    print(f"shutting down server pid={context.server_process.pid} ...\n")
+    print(f"shutting down server pid={context.server_process.pid} ...")
    if os.name == 'nt':
-        os.kill(context.server_process.pid, signal.CTRL_C_EVENT)
+        interrupt = signal.CTRL_C_EVENT
    else:
-        os.kill(context.server_process.pid, signal.SIGINT)
-
-
-def server_kill(context):
-    print(f"killing server pid={context.server_process.pid} ...\n")
-    context.server_process.kill()
-
-
-def server_kill_hard(context):
-    pid = context.server_process.pid
-    path = context.server_path
-
-    print(f"Server dangling exits, hard killing force {pid}={path}...\n")
-    if os.name == 'nt':
-        process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode()
-        print(process)
-    else:
-        os.kill(-pid, signal.SIGKILL)
+        interrupt = signal.SIGINT
+    context.server_process.send_signal(interrupt)


 def is_server_listening(server_fqdn, server_port):
@@ -79,22 +67,5 @@ def is_server_listening(server_fqdn, server_port):
        result = sock.connect_ex((server_fqdn, server_port))
        _is_server_listening = result == 0
        if _is_server_listening:
-            print(f"server is listening on {server_fqdn}:{server_port}...\n")
+            print(f"server is listening on {server_fqdn}:{server_port}...")
        return _is_server_listening
-
-
-def pid_exists(pid):
-    """Check whether pid exists in the current process table."""
-    if pid < 0:
-        return False
-    if os.name == 'nt':
-        output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode()
-        print(output)
-        return "No tasks are running" not in output
-    else:
-        try:
-            os.kill(pid, 0)
-        except OSError as e:
-            return e.errno == errno.EPERM
-        else:
-            return True
--- a/examples/server/tests/features/parallel.feature
+++ b/examples/server/tests/features/parallel.feature
@@ -4,7 +4,8 @@ Feature: Parallel

  Background: Server startup
    Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
+    And   a model file test-model-00001-of-00003.gguf
    And   42 as server seed
    And   128 as batch size
    And   256 KV cache size
--- a/examples/server/tests/features/security.feature
+++ b/examples/server/tests/features/security.feature
@@ -37,6 +37,22 @@ Feature: Security
      | llama.cpp | no        |
      | hackme    | raised    |

+  Scenario Outline: OAI Compatibility (invalid response formats)
+    Given a system prompt test
+    And   a user prompt test
+    And   a response format <response_format>
+    And   a model test
+    And   2 max tokens to predict
+    And   streaming is disabled
+    Given an OAI compatible chat completions request with raised api error
+
+    Examples: Prompts
+      | response_format                                       |
+      | {"type": "sound"}                                     |
+      | {"type": "json_object", "schema": 123}                |
+      | {"type": "json_object", "schema": {"type": 123}}      |
+      | {"type": "json_object", "schema": {"type": "hiccup"}} |
+

  Scenario Outline: CORS Options
    Given a user api key llama.cpp
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -5,6 +5,7 @@ Feature: llama.cpp server
  Background: Server startup
    Given a server listening on localhost:8080
    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   a model file test-model.gguf
    And   a model alias tinyllama-2
    And   42 as server seed
      # KV Cache corresponds to the total amount of tokens
@@ -34,9 +35,9 @@ Feature: llama.cpp server
    And   metric llamacpp:tokens_predicted is <n_predicted>

    Examples: Prompts
-      | prompt                                                                    | n_predict | re_content                    | n_prompt | n_predicted | truncated |
-      | I believe the meaning of life is                                          | 8         | (read\|going)+                | 18       | 8           | not       |
-      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids)+ | 46       | 64          | not       |
+      | prompt                                                                    | n_predict | re_content                                  | n_prompt | n_predicted | truncated |
+      | I believe the meaning of life is                                          | 8         | (read\|going)+                              | 18       | 8           | not       |
+      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids\|Anna\|forest)+ | 46       | 64          | not       |

  Scenario: Completion prompt truncated
    Given a prompt:
@@ -47,7 +48,7 @@ Feature: llama.cpp server
    Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
    """
    And   a completion request with no api error
-    Then  64 tokens are predicted matching fun|Annaks|popcorns|pictry
+    Then  64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl
    And   the completion is  truncated
    And   109 prompt tokens are processed

@@ -64,9 +65,25 @@ Feature: llama.cpp server
    And   the completion is <truncated> truncated

    Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | re_content             | n_prompt | n_predicted | enable_streaming | truncated |
-      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+          | 77       | 8           | disabled         | not       |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird)+ | -1       | 64          | enabled          |           |
+      | model        | system_prompt               | user_prompt                          | max_tokens | re_content                        | n_prompt | n_predicted | enable_streaming | truncated |
+      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+                     | 77       | 8           | disabled         | not       |
+      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird\|Annabyear)+ | -1       | 64          | enabled          |           |
+
+
+  Scenario Outline: OAI Compatibility w/ response format
+    Given a model test
+    And   a system prompt test
+    And   a user prompt test
+    And   a response format <response_format>
+    And   10 max tokens to predict
+    Given an OAI compatible chat completions request with no api error
+    Then  <n_predicted> tokens are predicted matching <re_content>
+
+    Examples: Prompts
+      | response_format                                                     | n_predicted | re_content             |
+      | {"type": "json_object", "schema": {"const": "42"}}                  | 5           | "42"                   |
+      | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10          | \[ -300 \]             |
+      | {"type": "json_object"}                                             | 10          | \{ " Jacky.            |


  Scenario: Tokenize / Detokenize
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -5,6 +5,8 @@ import os
 import re
 import socket
 import subprocess
+import sys
+import threading
 import time
 from contextlib import closing
 from re import RegexFlag
@@ -14,7 +16,6 @@ import numpy as np
 import openai
 from behave import step
 from behave.api.async_step import async_run_until_complete
-from huggingface_hub import hf_hub_download
 from prometheus_client import parser


@@ -22,22 +23,29 @@ from prometheus_client import parser
 def step_server_config(context, server_fqdn, server_port):
    context.server_fqdn = server_fqdn
    context.server_port = int(server_port)
+    context.n_gpu_layer = None
    if 'PORT' in os.environ:
        context.server_port = int(os.environ['PORT'])
        print(f"$PORT set, overriding server port with to {context.server_port}")
    if 'FQDN' in os.environ:
        context.server_fqdn = os.environ['FQDN']
        print(f"$FQDN set, overriding server fqdn with to {context.server_fqdn}")
+    if 'N_GPU_LAYERS' in os.environ:
+        context.n_gpu_layer = int(os.environ['N_GPU_LAYERS'])
+        print(f"$N_GPU_LAYERS set, overriding n_gpu_layer with to {context.n_gpu_layer}")

    context.base_url = f'http://{context.server_fqdn}:{context.server_port}'

    context.model_alias = None
+    context.model_file = None
+    context.model_hf_repo = None
+    context.model_hf_file = None
+    context.model_url = None
    context.n_batch = None
    context.n_ubatch = None
    context.n_ctx = None
    context.n_ga = None
    context.n_ga_w = None
-    context.n_gpu_layer = None
    context.n_predict = None
    context.n_prompts = 0
    context.n_server_predict = None
@@ -52,6 +60,7 @@ def step_server_config(context, server_fqdn, server_port):
    context.seed = None
    context.server_seed = None
    context.user_api_key = None
+    context.response_format = None

    context.tasks_result = []
    context.concurrent_tasks = []
@@ -60,9 +69,19 @@ def step_server_config(context, server_fqdn, server_port):

@step('a model file {hf_file} from HF repo {hf_repo}')
 def step_download_hf_model(context, hf_file, hf_repo):
-    context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file)
-    if context.debug:
-        print(f"model file: {context.model_file}\n")
+    context.model_hf_repo = hf_repo
+    context.model_hf_file = hf_file
+    context.model_file = os.path.basename(hf_file)
+
+
+@step('a model file {model_file}')
+def step_model_file(context, model_file):
+    context.model_file = model_file
+
+
+@step('a model url {model_url}')
+def step_model_url(context, model_url):
+    context.model_url = model_url


@step('a model alias {model_alias}')
@@ -123,9 +142,12 @@ def step_start_server(context):
    if 'GITHUB_ACTIONS' in os.environ:
        max_attempts *= 2

+    addrs = socket.getaddrinfo(context.server_fqdn, context.server_port, type=socket.SOCK_STREAM)
+    family, typ, proto, _, sockaddr = addrs[0]
+
    while True:
-        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
-            result = sock.connect_ex((context.server_fqdn, context.server_port))
+        with closing(socket.socket(family, typ, proto)) as sock:
+            result = sock.connect_ex(sockaddr)
            if result == 0:
                print("\x1b[33;46mserver started!\x1b[0m")
                return
@@ -141,7 +163,8 @@ def step_start_server(context):
 async def step_wait_for_the_server_to_be_started(context, expecting_status):
    match expecting_status:
        case 'healthy':
-            await wait_for_health_status(context, context.base_url, 200, 'ok')
+            await wait_for_health_status(context, context.base_url, 200, 'ok',
+                                         timeout=30)

        case 'ready' | 'idle':
            await wait_for_health_status(context, context.base_url, 200, 'ok',
@@ -194,7 +217,7 @@ async def step_request_completion(context, api_error):
                                          user_api_key=context.user_api_key)
    context.tasks_result.append(completion)
    if context.debug:
-        print(f"Completion response: {completion}\n")
+        print(f"Completion response: {completion}")
    if expect_api_error:
        assert completion == 401, f"completion must be an 401 status code: {completion}"

@@ -248,6 +271,11 @@ def step_max_tokens(context, max_tokens):
    context.n_predict = max_tokens


+@step('a response format {response_format}')
+def step_response_format(context, response_format):
+    context.response_format = json.loads(response_format)
+
+
@step('streaming is {enable_streaming}')
 def step_streaming(context, enable_streaming):
    context.enable_streaming = enable_streaming == 'enabled'
@@ -339,7 +367,7 @@ def step_prompt_passkey(context, passkey, i_pos):
        prompt += context.prompt_junk_suffix
    if context.debug:
        passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
-        print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
+        print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```")
    context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
    context.n_prompts = len(context.prompts)

@@ -348,7 +376,7 @@ def step_prompt_passkey(context, passkey, i_pos):
@async_run_until_complete
 async def step_oai_chat_completions(context, api_error):
    if context.debug:
-        print(f"Submitting OAI compatible completions request...\n")
+        print(f"Submitting OAI compatible completions request...")
    expect_api_error = api_error == 'raised'
    completion = await oai_chat_completions(context.prompts.pop(),
                                            context.system_prompt,
@@ -363,6 +391,9 @@ async def step_oai_chat_completions(context, api_error):
                                            enable_streaming=context.enable_streaming
                                            if hasattr(context, 'enable_streaming') else None,

+                                            response_format=context.response_format
+                                            if hasattr(context, 'response_format') else None,
+
                                            seed=await completions_seed(context),

                                            user_api_key=context.user_api_key
@@ -422,6 +453,8 @@ async def step_oai_chat_completions(context):
                              if hasattr(context, 'n_predict') else None,
                              enable_streaming=context.enable_streaming
                              if hasattr(context, 'enable_streaming') else None,
+                              response_format=context.response_format
+                              if hasattr(context, 'response_format') else None,
                              seed=await completions_seed(context),
                              user_api_key=context.user_api_key
                              if hasattr(context, 'user_api_key') else None)
@@ -442,6 +475,8 @@ async def step_oai_chat_completions(context):
                              if hasattr(context, 'n_predict') else None,
                              enable_streaming=context.enable_streaming
                              if hasattr(context, 'enable_streaming') else None,
+                              response_format=context.response_format
+                              if hasattr(context, 'response_format') else None,
                              seed=context.seed
                              if hasattr(context, 'seed') else
                              context.server_seed
@@ -493,12 +528,12 @@ async def step_all_embeddings_are_the_same(context):
            embedding1 = np.array(embeddings[i])
            embedding2 = np.array(embeddings[j])
            if context.debug:
-                print(f"embedding1: {embedding1[-8:]}\n")
-                print(f"embedding2: {embedding2[-8:]}\n")
+                print(f"embedding1: {embedding1[-8:]}")
+                print(f"embedding2: {embedding2[-8:]}")
            similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
            msg = f"Similarity between {i} and {j}: {similarity:.10f}"
            if context.debug:
-                print(f"{msg}\n")
+                print(f"{msg}")
            assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg


@@ -615,7 +650,7 @@ async def step_prometheus_metrics_exported(context):
            metrics_raw = await metrics_response.text()
            metric_exported = False
            if context.debug:
-                print(f"/metrics answer:\n{metrics_raw}\n")
+                print(f"/metrics answer:\n{metrics_raw}")
            context.metrics = {}
            for metric in parser.text_string_to_metric_families(metrics_raw):
                match metric.name:
@@ -724,6 +759,7 @@ async def oai_chat_completions(user_prompt,
                               model=None,
                               n_predict=None,
                               enable_streaming=None,
+                               response_format=None,
                               seed=None,
                               user_api_key=None,
                               expect_api_error=None):
@@ -749,6 +785,8 @@ async def oai_chat_completions(user_prompt,
        "stream": enable_streaming,
        "seed": seed
    }
+    if response_format is not None:
+        payload['response_format'] = response_format
    completion_response = {
        'content': '',
        'timings': {
@@ -809,6 +847,7 @@ async def oai_chat_completions(user_prompt,
                model=model,
                max_tokens=n_predict,
                stream=enable_streaming,
+                response_format=payload.get('response_format'),
                seed=seed
            )
        except openai.error.AuthenticationError as e:
@@ -917,7 +956,7 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
            last_match = end
        highlighted += content[last_match:]
        if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
-          print(f"Checking completion response: {highlighted}\n")
+          print(f"Checking completion response: {highlighted}")
        assert last_match > 0, f'/{re_content}/ must match ```{highlighted}```'
    if expected_predicted_n and expected_predicted_n > 0:
        assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
@@ -927,7 +966,7 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
 async def gather_tasks_results(context):
    n_tasks = len(context.concurrent_tasks)
    if context.debug:
-        print(f"Waiting for all {n_tasks} tasks results...\n")
+        print(f"Waiting for all {n_tasks} tasks results...")
    for task_no in range(n_tasks):
        context.tasks_result.append(await context.concurrent_tasks.pop())
    n_completions = len(context.tasks_result)
@@ -944,7 +983,7 @@ async def wait_for_health_status(context,
                                 slots_processing=None,
                                 expected_slots=None):
    if context.debug:
-        print(f"Starting checking for health for expected_health_status={expected_health_status}\n")
+        print(f"Starting checking for health for expected_health_status={expected_health_status}")
    interval = 0.5
    counter = 0
    if 'GITHUB_ACTIONS' in os.environ:
@@ -1033,13 +1072,18 @@ def start_server_background(context):
    if 'LLAMA_SERVER_BIN_PATH' in os.environ:
        context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
    server_listen_addr = context.server_fqdn
-    if os.name == 'nt':
-        server_listen_addr = '0.0.0.0'
    server_args = [
        '--host', server_listen_addr,
        '--port', context.server_port,
-        '--model', context.model_file
    ]
+    if context.model_file:
+        server_args.extend(['--model', context.model_file])
+    if context.model_url:
+        server_args.extend(['--model-url', context.model_url])
+    if context.model_hf_repo:
+        server_args.extend(['--hf-repo', context.model_hf_repo])
+    if context.model_hf_file:
+        server_args.extend(['--hf-file', context.model_hf_file])
    if context.n_batch:
        server_args.extend(['--batch-size', context.n_batch])
    if context.n_ubatch:
@@ -1070,7 +1114,7 @@ def start_server_background(context):
        server_args.append('--verbose')
    if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
        server_args.extend(['--log-format', "text"])
-    print(f"starting server with: {context.server_path} {server_args}\n")
+    print(f"starting server with: {context.server_path} {server_args}")
    flags = 0
    if 'nt' == os.name:
        flags |= subprocess.DETACHED_PROCESS
@@ -1079,8 +1123,23 @@ def start_server_background(context):

    pkwargs = {
        'creationflags': flags,
+        'stdout': subprocess.PIPE,
+        'stderr': subprocess.PIPE
    }
    context.server_process = subprocess.Popen(
        [str(arg) for arg in [context.server_path, *server_args]],
        **pkwargs)
+
+    def log_stdout(process):
+        for line in iter(process.stdout.readline, b''):
+            print(line.decode('utf-8'), end='')
+    thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
+    thread_stdout.start()
+
+    def log_stderr(process):
+        for line in iter(process.stderr.readline, b''):
+            print(line.decode('utf-8'), end='', file=sys.stderr)
+    thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
+    thread_stderr.start()
+
    print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -12,7 +12,7 @@

 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"

-using json = nlohmann::json;
+using json = nlohmann::ordered_json;

 // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
 enum error_type {
@@ -95,8 +95,8 @@ static inline void server_log(const char *level, const char *function, int line,

        const std::string str = ss.str();
        printf("%.*s\n", (int)str.size(), str.data());
-        fflush(stdout);
    }
+    fflush(stdout);
 }

 //
@@ -352,39 +352,71 @@ static json oaicompat_completion_params_parse(
    // https://platform.openai.com/docs/api-reference/chat/create
    llama_sampling_params default_sparams;
    llama_params["model"]             = json_value(body,   "model",             std::string("unknown"));
-    llama_params["prompt"]            = format_chat(model, chat_template,       body["messages"]);
-    llama_params["cache_prompt"]      = json_value(body,   "cache_prompt",      false);
-    llama_params["temperature"]       = json_value(body,   "temperature",       0.0);
-    llama_params["top_k"]             = json_value(body,   "top_k",             default_sparams.top_k);
-    llama_params["top_p"]             = json_value(body,   "top_p",             1.0);
-    llama_params["n_predict"]         = json_value(body,   "max_tokens",        -1);
-    llama_params["logit_bias"]        = json_value(body,   "logit_bias",        json::object());
    llama_params["frequency_penalty"] = json_value(body,   "frequency_penalty", 0.0);
+    llama_params["logit_bias"]        = json_value(body,   "logit_bias",        json::object());
+    llama_params["n_predict"]         = json_value(body,   "max_tokens",        -1);
    llama_params["presence_penalty"]  = json_value(body,   "presence_penalty",  0.0);
    llama_params["seed"]              = json_value(body,   "seed",              LLAMA_DEFAULT_SEED);
    llama_params["stream"]            = json_value(body,   "stream",            false);
-    llama_params["mirostat"]          = json_value(body,   "mirostat",          default_sparams.mirostat);
-    llama_params["mirostat_tau"]      = json_value(body,   "mirostat_tau",      default_sparams.mirostat_tau);
-    llama_params["mirostat_eta"]      = json_value(body,   "mirostat_eta",      default_sparams.mirostat_eta);
-    llama_params["penalize_nl"]       = json_value(body,   "penalize_nl",       default_sparams.penalize_nl);
-    llama_params["typical_p"]         = json_value(body,   "typical_p",         default_sparams.typical_p);
-    llama_params["repeat_last_n"]     = json_value(body,   "repeat_last_n",     default_sparams.penalty_last_n);
-    llama_params["ignore_eos"]        = json_value(body,   "ignore_eos",        false);
-    llama_params["tfs_z"]             = json_value(body,   "tfs_z",             default_sparams.tfs_z);
+    llama_params["temperature"]       = json_value(body,   "temperature",       0.0);
+    llama_params["top_p"]             = json_value(body,   "top_p",             1.0);

-    if (body.count("grammar") != 0) {
-        llama_params["grammar"] = json_value(body, "grammar", json::object());
-    }
+    // Apply chat template to the list of messages
+    llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);

-    // Handle 'stop' field
+    // Handle "stop" field
    if (body.contains("stop") && body["stop"].is_string()) {
        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }
+    // Some chat templates don't use EOS token to stop generation
+    // We must add their end sequences to list of stop words
+    llama_params["stop"].push_back("<|im_end|>"); // chatml
+    llama_params["stop"].push_back("<end_of_turn>"); // gemma

-    // Ensure there is ChatML-specific end sequence among stop words
-    llama_params["stop"].push_back("<|im_end|>");
+    // Handle "response_format" field
+    if (body.contains("response_format")) {
+        json response_format      = json_value(body, "response_format", json::object());
+        std::string response_type = json_value(response_format, "type", std::string());
+        if (response_type == "json_object") {
+            llama_params["json_schema"] = json_value(response_format, "schema", json::object());
+        } else if (!response_type.empty() && response_type != "text") {
+            throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type);
+        }
+    }
+
+    // Handle "n" field
+    int n_choices = json_value(body, "n", 1);
+    if (n_choices != 1) {
+        throw std::runtime_error("Only one completion choice is allowed");
+    }
+
+    // Handle "logprobs" field
+    // TODO: The response format of this option is not yet OAI-compatible, but it seems no one is really using it; we may need to fix it in the future
+    if (body.contains("logprobs")) {
+        llama_params["n_probs"] = json_value(body, "top_logprobs", 20);
+    } else if (body.contains("top_logprobs")) {
+        throw std::runtime_error("top_logprobs requires logprobs to be set to true");
+    }
+
+    // Params supported by OAI but unsupported by llama.cpp
+    static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
+    for (auto & param : unsupported_params) {
+        if (body.contains(param)) {
+            throw std::runtime_error("Unsupported param: " + param);
+        }
+    }
+
+    // Copy remaining properties to llama_params
+    // This allows the user to pass llama.cpp-specific params like "mirostat", "tfs_z", etc. via the OAI endpoint.
+    // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp
+    for (const auto & item : body.items()) {
+        // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
+        if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
+            llama_params[item.key()] = item.value();
+        }
+    }

    return llama_params;
 }
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -65,7 +65,6 @@ int main(int argc, char ** argv) {
    llama_context * ctx_dft = NULL;

    // load the target model
-    params.logits_all = true;
    std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);

    // load the draft model
@@ -219,7 +218,8 @@ int main(int argc, char ** argv) {
                if (params.sparams.temp > 0) {
                    // stochastic verification

-                    llama_token_data_array dist_tgt = llama_sampling_probability_distribution(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
+                    llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft], true, NULL);
+                    llama_sample_softmax(ctx_tgt, &dist_tgt);
                    float p_tgt = 0, p_dft = 0;

                    // GGML_ASSERT(dist_tgt.size() == dist_dft.size());
--- a/examples/sycl/win-build-sycl.bat
+++ b/examples/sycl/win-build-sycl.bat
@@ -3,9 +3,13 @@
 ::  Copyright (C) 2024 Intel Corporation
 ::  SPDX-License-Identifier: MIT

-mkdir -p build
+
+IF not exist build (mkdir build)
 cd build
+if %errorlevel% neq 0 goto ERROR
+
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+if %errorlevel% neq 0 goto ERROR

 ::  for FP16
 ::  faster for long-prompt inference
@@ -13,11 +17,18 @@ cd build

 ::  for FP32
 cmake -G "MinGW Makefiles" ..  -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx  -DCMAKE_BUILD_TYPE=Release
-
-
+if %errorlevel% neq 0 goto ERROR
 ::  build example/main only
 ::  make main

 ::  build all binary
 make -j
+if %errorlevel% neq 0 goto ERROR
+
 cd ..
+exit /B 0
+
+:ERROR
+echo command error: %errorlevel%
+exit /B %errorlevel%
+
--- a/examples/sycl/win-run-llama2.bat
+++ b/examples/sycl/win-run-llama2.bat
@@ -6,8 +6,6 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force


-set GGML_SYCL_DEVICE=0
-rem set GGML_SYCL_DEBUG=1
 .\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0


--- a/examples/ts-type-to-grammar.sh
+++ b/examples/ts-type-to-grammar.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Turn a TypeScript type into a grammar: type -> JSON schema (via npx) -> json-schema-to-grammar.py
+# ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}"
+# python examples/json-schema-to-grammar.py https://json.schemastore.org/tsconfig.json
+#
+set -euo pipefail
+
+readonly type="${1:?usage: $0 TYPESCRIPT_TYPE}"
+
+# Create a scratch directory, removed on exit (not named TMPDIR: mktemp and child processes read that variable)
+WORK_DIR=""
+trap 'rm -fR "$WORK_DIR"' EXIT
+WORK_DIR=$(mktemp -d)
+
+DTS_FILE="$WORK_DIR/type.d.ts"
+SCHEMA_FILE="$WORK_DIR/schema.json"
+
+echo "export type MyType = $type" > "$DTS_FILE"
+
+# This is a fork of typescript-json-schema, actively maintained as of March 2024:
+# https://github.com/vega/ts-json-schema-generator
+npx ts-json-schema-generator --unstable --no-top-ref --path "$DTS_FILE" --type MyType -e none > "$SCHEMA_FILE"
+
+# Alternative, not actively maintained as of March 2024:
+# https://github.com/YousefED/typescript-json-schema
+# npx typescript-json-schema --defaultProps --required "$DTS_FILE" MyType | tee "$SCHEMA_FILE" >&2
+
+./examples/json-schema-to-grammar.py "$SCHEMA_FILE"
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1709703039,
-        "narHash": "sha256-6hqgQ8OK6gsMu1VtcGKBxKQInRLHtzulDo9Z5jxHEFY=",
+        "lastModified": 1711163522,
+        "narHash": "sha256-YN/Ciidm+A0fmJPWlHBGvVkcarYWSC+s3NTPk/P+q3c=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "9df3e30ce24fd28c7b3e2de0d986769db5d6225d",
+        "rev": "44d0940ea560dee511026a53f0e2e2cde489b4d4",
        "type": "github"
      },
      "original": {
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];

-        if (ggml_is_view(node)) {
+        // TODO: better way to add external dependencies
+        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+        // itself is never used and should not be considered a dependency
+        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
            struct ggml_tensor * view_src = node->view_src;
            ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
        }
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr

            ggml_gallocr_hash_get(galloc, src)->n_children += 1;

-            // allocate explicit inputs and leafs
-            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+            // allocate explicit inputs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
            }
        }
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -103,6 +103,11 @@ extern "C" {
        // check if the backend supports an operation
        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weight has to be copied from the CPU temporarily
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
        // (optional) event synchronization
        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
        void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
--- a/Show More
+++ b/Show More