CUDA: fix RoPE asserts, block sizes (#2833 )

llama.h : add missing struct keyword for C compat in callback type (#2847 )
metal : fix memory leak (#2762 )
2026-02-05 13:53:23 +02:00 · 2023-08-28 14:23:55 +03:00 · 2023-08-28 11:19:59 +03:00 · 2023-08-28 10:59:08 +03:00 · 2023-08-28 09:32:25 +03:00 · 2023-08-28 09:31:24 +03:00
126 changed files with 30157 additions and 9081 deletions
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -0,0 +1,44 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the CUDA build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/llama-cpp-clblast.srpm.spec
+++ b/.devops/llama-cpp-clblast.srpm.spec
@@ -0,0 +1,84 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+#    We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+#    It is up to the user to install the correct vendor-specific support.
+
+Name:           llama.cpp-clblast
+Version:        %( date "+%%Y%%m%%d" )
+Release:        1%{?dist}
+Summary:        OpenCL Inference of LLaMA model in C/C++
+License:        MIT
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires:  coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
+Requires:       clblast
+URL:            https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CPU inference for Meta's Lllama2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j LLAMA_CLBLAST=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llamaclblast
+cp -p server %{buildroot}%{_bindir}/llamaclblastserver
+cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
+[Unit]
+Description=Llama.cpp server, CPU only (no GPU support in this build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=never
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF  > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llamaclblast
+%{_bindir}/llamaclblastserver
+%{_bindir}/llamaclblastsimple
+/usr/lib/systemd/system/llamaclblast.service
+%config /etc/sysconfig/llama
+
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
--- a/.devops/llama-cpp-cublas.srpm.spec
+++ b/.devops/llama-cpp-cublas.srpm.spec
@@ -0,0 +1,83 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+#    We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+#    It is up to the user to install the correct vendor-specific support.
+
+Name:           llama.cpp-cublas
+Version:        %( date "+%%Y%%m%%d" )
+Release:        1%{?dist}
+Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
+License:        MIT
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
+Requires:       cuda-toolkit
+URL:            https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CPU inference for Meta's Lllama2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j LLAMA_CUBLAS=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llamacppcublas
+cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacublas.service
+[Unit]
+Description=Llama.cpp server, CPU only (no GPU support in this build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=never
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF  > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llamacppcublas
+%{_bindir}/llamacppcublasserver
+%{_bindir}/llamacppcublassimple
+/usr/lib/systemd/system/llamacublas.service
+%config /etc/sysconfig/llama
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@@ -0,0 +1,85 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# Built and maintained by John Boero - boeroboy@gmail.com
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+#    We need to declare standard versioning if people want to sort latest releases.
+#    In the meantime, YYYYMMDD format will be used.
+# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+#    It is up to the user to install the correct vendor-specific support.
+
+Name:           llama.cpp
+Version:        %( date "+%%Y%%m%%d" )
+Release:        1%{?dist}
+Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
+License:        MIT
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires:  coreutils make gcc-c++ git libstdc++-devel
+Requires:       libstdc++
+URL:            https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CPU inference for Meta's Lllama2 models using default options.
+Models are not included in this package and must be downloaded separately.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p main %{buildroot}%{_bindir}/llama
+cp -p server %{buildroot}%{_bindir}/llamaserver
+cp -p simple %{buildroot}%{_bindir}/llamasimple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llama.service
+[Unit]
+Description=Llama.cpp server, CPU only (no GPU support in this build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=never
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF  > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llama
+%{_bindir}/llamaserver
+%{_bindir}/llamasimple
+/usr/lib/systemd/system/llama.service
+%config /etc/sysconfig/llama
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
--- a/.devops/main-rocm.Dockerfile
+++ b/.devops/main-rocm.Dockerfile
@@ -0,0 +1,44 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the CUDA build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} as build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH=\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102
+
+COPY requirements.txt requirements.txt
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set nvcc architecture
+ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV LLAMA_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make
+
+ENTRYPOINT [ "/app/main" ]
--- a/.dockerignore
+++ b/.dockerignore
@@ -5,14 +5,7 @@
 .vscode/
 .DS_Store

-build/
-build-em/
-build-debug/
-build-release/
-build-static/
-build-no-accel/
-build-sanitize-addr/
-build-sanitize-thread/
+build*/

 models/*

--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -291,24 +291,32 @@ jobs:
          cd build
          ctest -C Release --verbose --timeout 900

-      - name: Get commit hash
-        id: commit
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: pr-mpt/actions-commit-hash@v2
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi

      - name: Pack artifacts
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
-          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v3
        with:
          path: |
-            llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
+            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip

  windows-latest-cmake-cublas:
    runs-on: windows-latest
@@ -338,23 +346,31 @@ jobs:
          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
          cmake --build . --config Release

-      - name: Get commit hash
-        id: commit
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: pr-mpt/actions-commit-hash@v2
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi

      - name: Pack artifacts
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
-          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        uses: actions/upload-artifact@v3
        with:
          path: |
-            llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip

      - name: Copy and pack Cuda runtime
        if: ${{ matrix.cuda == '12.1.0' }}
@@ -400,21 +416,34 @@ jobs:
      - windows-latest-cmake-cublas

    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Determine tag name
+        id: tag
+        shell: bash
+        run: |
+          BUILD_NUMBER="$(git rev-list --count HEAD)"
+          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
+            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
+          else
+            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
+            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
+          fi
+
      - name: Download artifacts
        id: download-artifact
        uses: actions/download-artifact@v3

-      - name: Get commit hash
-        id: commit
-        uses: pr-mpt/actions-commit-hash@v2
-
      - name: Create release
        id: create_release
        uses: anzz1/action-create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
-          tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+          tag_name: ${{ steps.tag.outputs.name }}

      - name: Upload release
        id: upload_release
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,10 @@
 *.o
 *.a
 *.so
+*.gguf
+*.bin
+*.exe
+*.dll
 .DS_Store
 .build/
 .cache/
@@ -12,20 +16,7 @@
 .vs/
 .vscode/

-build/
-build-em/
-build-debug/
-build-release/
-build-ci-debug/
-build-ci-release/
-build-static/
-build-cublas/
-build-opencl/
-build-metal/
-build-mpi/
-build-no-accel/
-build-sanitize-addr/
-build-sanitize-thread/
+build*/
 out/
 tmp/

@@ -39,19 +30,24 @@ models-mnt
 /perplexity
 /embedding
 /train-text-from-scratch
+/convert-llama2c-to-ggml
 /simple
 /benchmark-matmult
 /vdot
 /server
 /Pipfile
 /embd-input-test
+/gguf
+/gguf-llama-simple
 /libllama.so
+/llama-bench
 build-info.h
 arm_neon.h
 compile_commands.json
 CMakeSettings.json

 __pycache__
+dist

 zig-out/
 zig-cache/
@@ -62,12 +58,11 @@ perf-*.txt

 examples/jeopardy/results.txt

-
-pyproject.toml
 poetry.lock
 poetry.toml

 # Test binaries
+tests/test-grammar-parser
 tests/test-double-float
 tests/test-grad0
 tests/test-opt
@@ -75,4 +70,3 @@ tests/test-quantize-fns
 tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0
-
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,12 +67,14 @@ endif()
 option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
 option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS                          "llama: use cuBLAS"                                OFF)
+option(LLAMA_CUBLAS                          "llama: use CUDA"                                  OFF)
+#option(LLAMA_CUDA_CUBLAS                     "llama: use cuBLAS for prompt processing"          OFF)
 option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
 set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
-option(LLAMA_CUDA_DMMV_F16                   "llama: use 16 bit floats for dmmv CUDA kernels"   OFF)
+option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some calculations"   OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
+option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
@@ -251,6 +253,9 @@ if (LLAMA_CUBLAS)
        set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

        add_compile_definitions(GGML_USE_CUBLAS)
+#        if (LLAMA_CUDA_CUBLAS)
+#            add_compile_definitions(GGML_CUDA_CUBLAS)
+#        endif()
        if (LLAMA_CUDA_FORCE_DMMV)
            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
        endif()
@@ -259,8 +264,8 @@ if (LLAMA_CUBLAS)
        if (DEFINED LLAMA_CUDA_DMMV_Y)
            add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
        endif()
-        if (LLAMA_CUDA_DMMV_F16)
-            add_compile_definitions(GGML_CUDA_DMMV_F16)
+        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
+            add_compile_definitions(GGML_CUDA_F16)
        endif()
        add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

@@ -271,10 +276,14 @@ if (LLAMA_CUBLAS)
        endif()

    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        if (LLAMA_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
+        # 52 == lowest CUDA 12 standard
+        # 60 == f16 CUDA intrinsics
+        # 61 == integer CUDA intrinsics
+        # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
        else()
-            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
        endif()
    endif()
    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
@@ -288,7 +297,6 @@ if (LLAMA_METAL)
    find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
    find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)
-    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)

    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)

@@ -305,7 +313,6 @@ if (LLAMA_METAL)
        ${FOUNDATION_LIBRARY}
        ${METAL_FRAMEWORK}
        ${METALKIT_FRAMEWORK}
-        ${METALPERFORMANCE_FRAMEWORK}
        )
 endif()

@@ -346,6 +353,43 @@ if (LLAMA_CLBLAST)
    endif()
 endif()

+if (LLAMA_HIPBLAS)
+    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
+
+    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
+    endif()
+    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
+    endif()
+
+    find_package(hip)
+    find_package(hipblas)
+    find_package(rocblas)
+
+    if (${hipblas_FOUND} AND ${hip_FOUND})
+        message(STATUS "HIP and hipBLAS found")
+        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
+        if (LLAMA_CUDA_FORCE_DMMV)
+            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
+        endif()
+        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+        target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+        target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
+        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
+        target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
+
+        if (LLAMA_STATIC)
+            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
+        endif()
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
+    else()
+        message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
        set(c_flags
@@ -357,6 +401,7 @@ if (LLAMA_ALL_WARNINGS)
            -Wshadow
            -Wstrict-prototypes
            -Wpointer-arith
+            -Wmissing-prototypes
        )
        set(cxx_flags
            -Wall
@@ -490,12 +535,16 @@ else()
 endif()

 #
-# Build libraries
+# libraries
 #

+# ggml
+
 add_library(ggml OBJECT
            ggml.c
            ggml.h
+            ggml-alloc.c
+            ggml-alloc.h
            ${GGML_SOURCES_CUDA}
            ${GGML_SOURCES_OPENCL}
            ${GGML_SOURCES_METAL}
@@ -515,10 +564,11 @@ if (BUILD_SHARED_LIBS)
    install(TARGETS ggml_shared LIBRARY)
 endif()

+# llama
+
 add_library(llama
            llama.cpp
            llama.h
-            llama-util.h
            )

 target_include_directories(llama PUBLIC .)
@@ -537,6 +587,10 @@ if (BUILD_SHARED_LIBS)
    install(TARGETS llama LIBRARY)
 endif()

+#
+# install
+#
+
 include(GNUInstallDirs)
 install(
    FILES convert.py
@@ -560,11 +614,23 @@ install(
        WORLD_READ
        WORLD_EXECUTE
    DESTINATION ${CMAKE_INSTALL_BINDIR})
+if (LLAMA_METAL)
+    install(
+        FILES ggml-metal.metal
+        PERMISSIONS
+            OWNER_READ
+            OWNER_WRITE
+            GROUP_READ
+            WORLD_READ
+        DESTINATION ${CMAKE_INSTALL_BINDIR})
+endif()

 #
 # programs, examples and tests
 #

+add_subdirectory(common)
+
 if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
    include(CTest)
    add_subdirectory(tests)
--- a/136
+++ b/136
@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf llama-bench

 # Binaries only useful for tests
-TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0

 default: $(BUILD_TARGETS)

@@ -45,8 +45,8 @@ OPT = -Ofast
 else
 OPT = -O3
 endif
-CFLAGS   = -I.              $(OPT) -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
+CFLAGS   = -I.            $(OPT) -std=c11   -fPIC
+CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
 LDFLAGS  =

 ifdef LLAMA_DEBUG
@@ -63,7 +63,8 @@ ifdef LLAMA_SERVER_VERBOSE
 endif

 # warnings
-CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
+CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
+			-Wmissing-prototypes
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar

 # OS specific
@@ -141,6 +142,28 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	#CXXFLAGS += -mssse3
 endif

+ifneq ($(filter aarch64%,$(UNAME_M)),)
+	# Apple M1, M2, etc.
+	# Raspberry Pi 3, 4, Zero 2 (64-bit)
+	CFLAGS   += -mcpu=native
+	CXXFLAGS += -mcpu=native
+endif
+
+ifneq ($(filter armv6%,$(UNAME_M)),)
+	# Raspberry Pi 1, Zero
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+endif
+
+ifneq ($(filter armv7%,$(UNAME_M)),)
+	# Raspberry Pi 2
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+endif
+
+ifneq ($(filter armv8%,$(UNAME_M)),)
+	# Raspberry Pi 3, 4, Zero 2 (32-bit)
+	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+endif
+
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
@@ -193,7 +216,7 @@ ifdef LLAMA_CUBLAS
 	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
-	NVCCFLAGS = --forward-unknown-to-host-compiler
+	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
 ifdef LLAMA_CUDA_NVCC
 	NVCC = $(LLAMA_CUDA_NVCC)
 else
@@ -219,19 +242,25 @@ else ifdef LLAMA_CUDA_DMMV_Y
 else
 	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_MMV_Y
+ifdef LLAMA_CUDA_F16
+	NVCCFLAGS += -DGGML_CUDA_F16
+endif # LLAMA_CUDA_F16
 ifdef LLAMA_CUDA_DMMV_F16
-	NVCCFLAGS += -DGGML_CUDA_DMMV_F16
+	NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_DMMV_F16
 ifdef LLAMA_CUDA_KQUANTS_ITER
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 else
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
+#ifdef LLAMA_CUDA_CUBLAS
+#	NVCCFLAGS += -DGGML_CUDA_CUBLAS
+#endif # LLAMA_CUDA_CUBLAS
 ifdef LLAMA_CUDA_CCBIN
 	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS

 ifdef LLAMA_CLBLAST
@@ -251,35 +280,37 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_CLBLAST

+ifdef LLAMA_HIPBLAS
+	ROCM_PATH	?= /opt/rocm
+	HIPCC	    ?= $(ROCM_PATH)/bin/hipcc
+	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+	LLAMA_CUDA_DMMV_X       ?= 32
+	LLAMA_CUDA_MMV_Y        ?= 1
+	LLAMA_CUDA_KQUANTS_ITER ?= 2
+	CFLAGS      += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	CXXFLAGS    += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	LDFLAGS     += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+	LDFLAGS		+= -lhipblas -lamdhip64 -lrocblas
+	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
+	HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+	HIPFLAGS    += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+	HIPFLAGS    += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+	HIPFLAGS    += -DCC_TURING=1000000000
+ifdef LLAMA_CUDA_FORCE_DMMV
+	HIPFLAGS 	+= -DGGML_CUDA_FORCE_DMMV
+endif # LLAMA_CUDA_FORCE_DMMV
+	OBJS        += ggml-cuda.o
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+endif # LLAMA_HIPBLAS
+
 ifdef LLAMA_METAL
 	CFLAGS   += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 	CXXFLAGS += -DGGML_USE_METAL
-	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
 	OBJS     += ggml-metal.o
 endif # LLAMA_METAL

-ifneq ($(filter aarch64%,$(UNAME_M)),)
-	# Apple M1, M2, etc.
-	# Raspberry Pi 3, 4, Zero 2 (64-bit)
-	CFLAGS   += -mcpu=native
-	CXXFLAGS += -mcpu=native
-endif
-
-ifneq ($(filter armv6%,$(UNAME_M)),)
-	# Raspberry Pi 1, Zero
-	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
-endif
-
-ifneq ($(filter armv7%,$(UNAME_M)),)
-	# Raspberry Pi 2
-	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
-endif
-
-ifneq ($(filter armv8%,$(UNAME_M)),)
-	# Raspberry Pi 3, 4, Zero 2 (32-bit)
-	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
-endif
-
 ifdef LLAMA_METAL
 ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
@@ -317,26 +348,34 @@ $(info )
 ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC)  $(CFLAGS)   -c $< -o $@

-llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
+ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
+	$(CC)  $(CFLAGS)   -c $< -o $@
+
+OBJS += ggml-alloc.o
+
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-common.o: examples/common.cpp examples/common.h
+common.o: common/common.cpp common/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
+console.o: common/console.cpp common/console.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf llama-bench build-info.h $(TEST_TARGETS)

 #
 # Examples
 #

-main: examples/main/main.cpp                                  build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp                                  build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
@@ -360,7 +399,7 @@ embedding: examples/embedding/embedding.cpp                   build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)

 $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
@@ -370,7 +409,16 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
 embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput

-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp    build-info.h ggml.o llama.o $(OBJS)
+gguf: examples/gguf/gguf.cpp                                  build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp    build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp    build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 build-info.h: $(wildcard .git/index) scripts/build-info.sh
@@ -394,13 +442,19 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

-tests/test-double-float: tests/test-double-float.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

-tests/test-grad0: tests/test-grad0.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

-tests/test-opt: tests/test-opt.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

 tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
--- a/README.md
+++ b/README.md
@@ -9,13 +9,25 @@

 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

-**Hot topics:**
+### Hot topics

- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
- New roadmap: https://github.com/users/ggerganov/projects/7
- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
+- #### IMPORTANT: Tokenizer fixes and API change (developers and projects using `llama.cpp` built-in tokenization must read): https://github.com/ggerganov/llama.cpp/pull/2810
+
+- GGUFv2 adds support for 64-bit sizes + backwards compatible: https://github.com/ggerganov/llama.cpp/pull/2821
+
+- Added support for Falcon models: https://github.com/ggerganov/llama.cpp/pull/2717
+
+- A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
+
+  Last revision compatible with the old format: [dadbed9](https://github.com/ggerganov/llama.cpp/commit/dadbed99e65252d79f81101a392d0d6497b86caa)
+
+  ### Current `master` should be considered in Beta - expect some issues for a few days!
+
+  ### Be prepared to re-convert and / or re-quantize your GGUF models while this notice is up!
+
+  ### Issues with non-GGUF models will be considered with low priority!
+
+----

 <details>
  <summary>Table of Contents</summary>
@@ -33,6 +45,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
        <li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li>
        <li><a href="#quantization">Quantization</a></li>
        <li><a href="#interactive-mode">Interactive mode</a></li>
+        <li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
        <li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
        <li><a href="#using-openllama">Using OpenLLaMA</a></li>
        <li><a href="#using-gpt4all">Using GPT4All</a></li>
@@ -59,12 +72,11 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
 - Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2 and AVX512 support for x86 architectures
 - Mixed F16 / F32 precision
- 4-bit, 5-bit and 8-bit integer quantization support
- Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS
- cuBLAS and CLBlast support
+- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
+- CUDA, Metal and OpenCL GPU backend support

 The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
-Since then, the project has improved significantly thanks to many contributions. This project is for educational purposes and serves
+Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
 as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.

 **Supported platforms:**
@@ -77,9 +89,11 @@ as the main playground for developing new features for the [ggml](https://github
 **Supported models:**

 - [X] LLaMA 🦙
+- [x] LLaMA 2 🦙🦙
+- [X] Falcon
 - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
+- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
@@ -87,6 +101,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
+- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)

 **Bindings:**

@@ -94,8 +109,10 @@ as the main playground for developing new features for the [ggml](https://github
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
+- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
+- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)

 **UI:**

@@ -104,90 +121,84 @@ as the main playground for developing new features for the [ggml](https://github

 ---

-Here is a typical run using LLaMA-7B:
+Here is a typical run using LLaMA v2 13B on M2 Ultra:

 ```java
-make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+$ make -j && ./main -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e
 I llama.cpp build info:
 I UNAME_S:  Darwin
 I UNAME_P:  arm
 I UNAME_M:  arm64
-I CFLAGS:   -I.              -O3 -DNDEBUG -std=c11   -fPIC -pthread -DGGML_USE_ACCELERATE
-I CXXFLAGS: -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC -pthread
+I CFLAGS:   -I.            -O3 -std=c11   -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE
+I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS
 I LDFLAGS:   -framework Accelerate
-I CC:       Apple clang version 14.0.0 (clang-1400.0.29.202)
-I CXX:      Apple clang version 14.0.0 (clang-1400.0.29.202)
+I CC:       Apple clang version 14.0.3 (clang-1403.0.22.14.1)
+I CXX:      Apple clang version 14.0.3 (clang-1403.0.22.14.1)

 make: Nothing to be done for `default'.
-main: seed = 1678486056
-llama_model_load: loading model from './models/7B/ggml-model-q4_0.bin' - please wait ...
-llama_model_load: n_vocab = 32000
-llama_model_load: n_ctx   = 512
-llama_model_load: n_embd  = 4096
-llama_model_load: n_mult  = 256
-llama_model_load: n_head  = 32
-llama_model_load: n_layer = 32
-llama_model_load: n_rot   = 128
-llama_model_load: f16     = 2
-llama_model_load: n_ff    = 11008
-llama_model_load: ggml ctx size = 4529.34 MB
-llama_model_load: memory_size =   512.00 MB, n_mem = 16384
-llama_model_load: .................................... done
-llama_model_load: model size =  4017.27 MB / num tensors = 291
+main: build = 1041 (cf658ad)
+main: seed  = 1692823051
+llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest))
+llama_model_loader: - type  f32:   81 tensors
+llama_model_loader: - type q4_0:  281 tensors
+llama_model_loader: - type q6_K:    1 tensors
+llm_load_print_meta: format         = GGUF V1 (latest)
+llm_load_print_meta: arch           = llama
+llm_load_print_meta: vocab type     = SPM
+llm_load_print_meta: n_vocab        = 32000
+llm_load_print_meta: n_merges       = 0
+llm_load_print_meta: n_ctx_train    = 4096
+llm_load_print_meta: n_ctx          = 512
+llm_load_print_meta: n_embd         = 5120
+llm_load_print_meta: n_head         = 40
+llm_load_print_meta: n_head_kv      = 40
+llm_load_print_meta: n_layer        = 40
+llm_load_print_meta: n_rot          = 128
+llm_load_print_meta: n_gqa          = 1
+llm_load_print_meta: f_norm_eps     = 1.0e-05
+llm_load_print_meta: f_norm_rms_eps = 1.0e-05
+llm_load_print_meta: n_ff           = 13824
+llm_load_print_meta: freq_base      = 10000.0
+llm_load_print_meta: freq_scale     = 1
+llm_load_print_meta: model type     = 13B
+llm_load_print_meta: model ftype    = mostly Q4_0
+llm_load_print_meta: model size     = 13.02 B
+llm_load_print_meta: general.name   = LLaMA v2
+llm_load_print_meta: BOS token = 1 '<s>'
+llm_load_print_meta: EOS token = 2 '</s>'
+llm_load_print_meta: UNK token = 0 '<unk>'
+llm_load_print_meta: LF token  = 13 '<0x0A>'
+llm_load_tensors: ggml ctx size =    0.11 MB
+llm_load_tensors: mem required  = 7024.01 MB (+  400.00 MB per state)
+...................................................................................................
+llama_new_context_with_model: kv self size  =  400.00 MB
+llama_new_context_with_model: compute buffer total size =   75.41 MB

-main: prompt: 'Building a website can be done in 10 simple steps:'
-main: number of tokens in prompt = 15
-     1 -> ''
-  8893 -> 'Build'
-   292 -> 'ing'
-   263 -> ' a'
-  4700 -> ' website'
-   508 -> ' can'
-   367 -> ' be'
-  2309 -> ' done'
-   297 -> ' in'
- 29871 -> ' '
- 29896 -> '1'
- 29900 -> '0'
-  2560 -> ' simple'
-  6576 -> ' steps'
- 29901 -> ':'
-
-sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000
+system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
+sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
+generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0


-Building a website can be done in 10 simple steps:
-1) Select a domain name and web hosting plan
-2) Complete a sitemap
-3) List your products
-4) Write product descriptions
-5) Create a user account
-6) Build the template
-7) Start building the website
-8) Advertise the website
-9) Provide email support
-10) Submit the website to search engines
-A website is a collection of web pages that are formatted with HTML. HTML is the code that defines what the website looks like and how it behaves.
-The HTML code is formatted into a template or a format. Once this is done, it is displayed on the user's browser.
-The web pages are stored in a web server. The web server is also called a host. When the website is accessed, it is retrieved from the server and displayed on the user's computer.
-A website is known as a website when it is hosted. This means that it is displayed on a host. The host is usually a web server.
-A website can be displayed on different browsers. The browsers are basically the software that renders the website on the user's screen.
-A website can also be viewed on different devices such as desktops, tablets and smartphones.
-Hence, to have a website displayed on a browser, the website must be hosted.
-A domain name is an address of a website. It is the name of the website.
-The website is known as a website when it is hosted. This means that it is displayed on a host. The host is usually a web server.
-A website can be displayed on different browsers. The browsers are basically the software that renders the website on the user’s screen.
-A website can also be viewed on different devices such as desktops, tablets and smartphones. Hence, to have a website displayed on a browser, the website must be hosted.
-A domain name is an address of a website. It is the name of the website.
-A website is an address of a website. It is a collection of web pages that are formatted with HTML. HTML is the code that defines what the website looks like and how it behaves.
-The HTML code is formatted into a template or a format. Once this is done, it is displayed on the user’s browser.
-A website is known as a website when it is hosted
-
-main: mem per token = 14434244 bytes
-main:     load time =  1332.48 ms
-main:   sample time =  1081.40 ms
-main:  predict time = 31378.77 ms / 61.41 ms per token
-main:    total time = 34036.74 ms
+ Building a website can be done in 10 simple steps:
+Step 1: Find the right website platform.
+Step 2: Choose your domain name and hosting plan.
+Step 3: Design your website layout.
+Step 4: Write your website content and add images.
+Step 5: Install security features to protect your site from hackers or spammers
+Step 6: Test your website on multiple browsers, mobile devices, operating systems etc…
+Step 7: Test it again with people who are not related to you personally – friends or family members will work just fine!
+Step 8: Start marketing and promoting the website via social media channels or paid ads
+Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc…
+Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further!
+How does a Website Work?
+A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable!
+The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking.
+How to
+llama_print_timings:        load time =   576.45 ms
+llama_print_timings:      sample time =   283.10 ms /   400 runs   (    0.71 ms per token,  1412.91 tokens per second)
+llama_print_timings: prompt eval time =   599.83 ms /    19 tokens (   31.57 ms per token,    31.68 tokens per second)
+llama_print_timings:        eval time = 24513.59 ms /   399 runs   (   61.44 ms per token,    16.28 tokens per second)
+llama_print_timings:       total time = 25431.49 ms
 ```

 And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook:
@@ -236,12 +247,17 @@ In order to build llama.cpp you have three different options.
    cmake --build . --config Release
    ```

- Using `Zig`:
+- Using `Zig` (version 0.11 or later):
+
+    Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
+    it's also possible to cross compile for other operating systems and architectures:

    ```bash
-    zig build -Doptimize=ReleaseFast
+    zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
    ```

+    The `zig targets` command will give you valid options to use.
+
 -   Using `gmake` (FreeBSD):

    1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@@ -282,7 +298,7 @@ When built with Metal support, you can enable GPU inference with the `--gpu-laye
 Any value larger than 0 will offload the computation to the GPU. For example:

 ```bash
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
 ```

 ### MPI Build
@@ -321,7 +337,7 @@ The above will distribute the computation across 2 processes on the first host a
 Finally, you're ready to run a computation using `mpirun`:

 ```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```

 ### BLAS Build
@@ -399,14 +415,46 @@ Building the program with BLAS support may lead to some performance improvements

  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:

+<!---
+  | LLAMA_CUDA_CUBLAS       | Boolean                |   false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
+--->
  | Option                  | Legal values           | Default | Description |
  |-------------------------|------------------------|---------|-------------|
  | LLAMA_CUDA_FORCE_DMMV   | Boolean                |   false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 |      32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_MMV_Y       | Positive integer       |       1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_DMMV_F16     | Boolean                |   false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
+  | LLAMA_CUDA_MMV_Y        | Positive integer       |       1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
+  | LLAMA_CUDA_F16          | Boolean                |   false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 |       2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |

+- #### hipBLAS
+
+  This provide BLAS acceleation on HIP supported GPU like AMD GPU.
+  Make sure to have ROCm installed.
+  You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
+  Windows support is coming soon...
+
+  - Using `make`:
+    ```bash
+    make LLAMA_HIPBLAS=1
+    ```
+  - Using `CMake`:
+    ```bash
+    mkdir build
+    cd build
+    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
+    cmake --build .
+    ```
+
+  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
+  If your GPU is not officialy supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
+  The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
+
+  | Option                  | Legal values           | Default | Description |
+  |-------------------------|------------------------|---------|-------------|
+  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 |      32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
+  | LLAMA_CUDA_MMV_Y        | Positive integer       |       1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 |       2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
+
 - #### CLBlast

  OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
@@ -487,6 +535,9 @@ Building the program with BLAS support may lead to some performance improvements
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
+  # [Optional] for models using BPE tokenizers
+  ls ./models
+  65B 30B 13B 7B vocab.json

 # install Python dependencies
 python3 -m pip install -r requirements.txt
@@ -494,11 +545,14 @@ python3 -m pip install -r requirements.txt
 # convert the 7B model to ggml FP16 format
 python3 convert.py models/7B/

+  # [Optional] for models using BPE tokenizers
+  python convert.py models/7B/ --vocabtype bpe
+
 # quantize the model to 4-bits (using q4_0 method)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
+./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0

 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```

 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -518,6 +572,8 @@ As the models are currently fully loaded into memory, you will need adequate dis

 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.

+*(outdated)*
+
 | Model | Measure      | F16    | Q4_0   | Q4_1   | Q5_0   | Q5_1   | Q8_0   |
 |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
 |    7B | perplexity   | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
@@ -554,7 +610,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./examples/chat-13B.sh

 # custom arguments using a 13B model
-./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```

 Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
@@ -580,6 +636,16 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
    CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
 ```

+### Constrained output with grammars
+
+`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
+
+```bash
+./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+```
+
+The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
+
 ### Instruction mode with Alpaca

 1. First, download the `ggml` Alpaca model into the `./models` folder
@@ -617,6 +683,8 @@ OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It

 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)

+*Note: these instructions are likely obsoleted by the GGUF update*
+
 - Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
 - Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
 - Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
@@ -650,6 +718,19 @@ python3 convert.py pygmalion-7b/ --outtype q4_1
 - The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
 - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.

+### Obtaining and using the Facebook LLaMA 2 model
+
+- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
+- Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including:
+  - [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGML)
+  - [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGML)
+  - [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGML)
+  - [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML)
+  - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML)
+  - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML)
+- Specify `-eps 1e-5` for best generation quality
+- Specify `-gqa 8` for 70B models to work
+
 ### Verifying the model files

 Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
@@ -679,7 +760,7 @@ If your issue is with model generation quality, then please at least scan the fo
 #### How to run

 1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-2. Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
 3. Output:
 ```
 perplexity : calculating perplexity over 655 chunks
@@ -778,13 +859,13 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-
 On completion, you are ready to play!

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 or with a light image:

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 ### Docker With CUDA
@@ -815,8 +896,8 @@ The resulting images, are essentially the same as the non-CUDA images:
 After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.

 ```bash
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 ```

 ### Contributing
@@ -846,3 +927,4 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
 - [BLIS](./docs/BLIS.md)
 - [Performance troubleshooting](./docs/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
+- [GBNF grammars](./grammars/README.md)
--- a/build.zig
+++ b/build.zig
@@ -1,68 +1,121 @@
+// Compatible with Zig Version 0.11.0
 const std = @import("std");
-const commit_hash = @embedFile(".git/refs/heads/master");
+const ArrayList = std.ArrayList;
+const Compile = std.Build.Step.Compile;
+const ConfigHeader = std.Build.Step.ConfigHeader;
+const Mode = std.builtin.Mode;
+const CrossTarget = std.zig.CrossTarget;

-// Zig Version: 0.11.0-dev.3986+e05c242cd
-pub fn build(b: *std.build.Builder) void {
-    const target = b.standardTargetOptions(.{});
-    const optimize = b.standardOptimizeOption(.{});
+const Maker = struct {
+    builder: *std.build.Builder,
+    target: CrossTarget,
+    optimize: Mode,
+    config_header: *ConfigHeader,
+    enable_lto: bool,

-    const config_header = b.addConfigHeader(
-        .{ .style = .blank, .include_path = "build-info.h" },
-        .{
-            .BUILD_NUMBER = 0,
-            .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
-        },
-    );
+    include_dirs: ArrayList([]const u8),
+    cflags: ArrayList([]const u8),
+    cxxflags: ArrayList([]const u8),
+    objs: ArrayList(*Compile),

-    const lib = b.addStaticLibrary(.{
-        .name = "llama",
-        .target = target,
-        .optimize = optimize,
-    });
-    lib.linkLibC();
-    lib.linkLibCpp();
-    lib.addIncludePath(".");
-    lib.addIncludePath("./examples");
-    lib.addConfigHeader(config_header);
-    lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"});
-    lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"});
-    b.installArtifact(lib);
+    fn addInclude(m: *Maker, dir: []const u8) !void {
+        try m.include_dirs.append(dir);
+    }
+    fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
+        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
+    }
+    fn addCFlag(m: *Maker, flag: []const u8) !void {
+        try m.cflags.append(flag);
+    }
+    fn addCxxFlag(m: *Maker, flag: []const u8) !void {
+        try m.cxxflags.append(flag);
+    }
+    fn addFlag(m: *Maker, flag: []const u8) !void {
+        try m.addCFlag(flag);
+        try m.addCxxFlag(flag);
+    }

-    const examples = .{
-        "main",
-        "baby-llama",
-        "embedding",
-        "metal",
-        "perplexity",
-        "quantize",
-        "quantize-stats",
-        "save-load-state",
-        "server",
-        "simple",
-        "train-text-from-scratch",
-    };
+    fn init(builder: *std.build.Builder) !Maker {
+        const commit_hash = @embedFile(".git/refs/heads/master");
+        const config_header = builder.addConfigHeader(
+            .{ .style = .blank, .include_path = "build-info.h" },
+            .{
+                .BUILD_NUMBER = 0,
+                .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
+            },
+        );
+        var m = Maker{
+            .builder = builder,
+            .target = builder.standardTargetOptions(.{}),
+            .optimize = builder.standardOptimizeOption(.{}),
+            .config_header = config_header,
+            .enable_lto = false,
+            .include_dirs = ArrayList([]const u8).init(builder.allocator),
+            .cflags = ArrayList([]const u8).init(builder.allocator),
+            .cxxflags = ArrayList([]const u8).init(builder.allocator),
+            .objs = ArrayList(*Compile).init(builder.allocator),
+        };
+        try m.addCFlag("-std=c11");
+        try m.addCxxFlag("-std=c++11");
+        try m.addProjectInclude(&.{});
+        try m.addProjectInclude(&.{"examples"});
+        return m;
+    }

-    inline for (examples) |example_name| {
-        const exe = b.addExecutable(.{
-            .name = example_name,
-            .target = target,
-            .optimize = optimize,
-        });
-        exe.addIncludePath(".");
-        exe.addIncludePath("./examples");
-        exe.addConfigHeader(config_header);
-        exe.addCSourceFiles(&.{
-            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }),
-            "examples/common.cpp",
-        }, &.{"-std=c++11"});
-        exe.linkLibrary(lib);
-        b.installArtifact(exe);
+    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
+        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        if (std.mem.endsWith(u8, src, ".c")) {
+            o.addCSourceFiles(&.{src}, m.cflags.items);
+            o.linkLibC();
+        } else {
+            o.addCSourceFiles(&.{src}, m.cxxflags.items);
+            o.linkLibCpp();
+        }
+        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
+        o.want_lto = m.enable_lto;
+        return o;
+    }

-        const run_cmd = b.addRunArtifact(exe);
-        run_cmd.step.dependOn(b.getInstallStep());
-        if (b.args) |args| run_cmd.addArgs(args);
+    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
+        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        e.addCSourceFiles(&.{src}, m.cxxflags.items);
+        for (deps) |d| e.addObject(d);
+        for (m.objs.items) |o| e.addObject(o);
+        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
+        e.linkLibC();
+        e.linkLibCpp();
+        e.addConfigHeader(m.config_header);
+        m.builder.installArtifact(e);
+        e.want_lto = m.enable_lto;
+        return e;
+    }
+};

-        const run_step = b.step("run-" ++ example_name, "Run the app");
-        run_step.dependOn(&run_cmd.step);
+pub fn build(b: *std.build.Builder) !void {
+    var make = try Maker.init(b);
+    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
+
+    if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
+        try make.addFlag("-DGGML_USE_K_QUANTS");
+        const k_quants = make.obj("k_quants", "k_quants.c");
+        try make.objs.append(k_quants);
+    }
+
+    const ggml = make.obj("ggml", "ggml.c");
+    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
+    const llama = make.obj("llama", "llama.cpp");
+    const common = make.obj("common", "examples/common.cpp");
+    const console = make.obj("common", "examples/console.cpp");
+    const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
+
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama });
+
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    if (server.target.isWindows()) {
+        server.linkSystemLibrary("ws2_32");
    }
 }
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -159,17 +159,17 @@ function gg_run_open_llama_3b_v2 {

    python3 ../convert.py ${path_models}

-    model_f16="${path_models}/ggml-model-f16.bin"
-    model_q8_0="${path_models}/ggml-model-q8_0.bin"
-    model_q4_0="${path_models}/ggml-model-q4_0.bin"
-    model_q4_1="${path_models}/ggml-model-q4_1.bin"
-    model_q5_0="${path_models}/ggml-model-q5_0.bin"
-    model_q5_1="${path_models}/ggml-model-q5_1.bin"
-    model_q2_k="${path_models}/ggml-model-q2_k.bin"
-    model_q3_k="${path_models}/ggml-model-q3_k.bin"
-    model_q4_k="${path_models}/ggml-model-q4_k.bin"
-    model_q5_k="${path_models}/ggml-model-q5_k.bin"
-    model_q6_k="${path_models}/ggml-model-q6_k.bin"
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test_60="${path_wiki}/wiki.test-60.raw"

@@ -196,17 +196,17 @@ function gg_run_open_llama_3b_v2 {
    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

-    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    function check_ppl {
        qnt="$1"
@@ -233,6 +233,48 @@ function gg_run_open_llama_3b_v2 {
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log

+    # lora
+    function compare_ppl {
+        qnt="$1"
+        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
+            return 20
+        fi
+
+        printf '  - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
+        return 0
+    }
+
+    path_lora="../models-mnt/open-llama/3B-v2/lora"
+    path_shakespeare="../models-mnt/shakespeare"
+
+    shakespeare="${path_shakespeare}/shakespeare.txt"
+    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
+
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
+    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
+
+    python3 ../convert-lora-to-ggml.py ${path_lora}
+
+    # f16
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # q8_0
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+    compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # q8_0 + f16 lora-base
+    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+    compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+
    set +e
 }

@@ -242,6 +284,7 @@ function gg_sum_open_llama_3b_v2 {
    gg_printf 'OpenLLaMA 3B-v2:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -253,6 +296,11 @@ function gg_sum_open_llama_3b_v2 {
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
+    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
+    gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
+    gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
+    gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }

 # open_llama_7b_v2
@@ -285,17 +333,17 @@ function gg_run_open_llama_7b_v2 {

    python3 ../convert.py ${path_models}

-    model_f16="${path_models}/ggml-model-f16.bin"
-    model_q8_0="${path_models}/ggml-model-q8_0.bin"
-    model_q4_0="${path_models}/ggml-model-q4_0.bin"
-    model_q4_1="${path_models}/ggml-model-q4_1.bin"
-    model_q5_0="${path_models}/ggml-model-q5_0.bin"
-    model_q5_1="${path_models}/ggml-model-q5_1.bin"
-    model_q2_k="${path_models}/ggml-model-q2_k.bin"
-    model_q3_k="${path_models}/ggml-model-q3_k.bin"
-    model_q4_k="${path_models}/ggml-model-q4_k.bin"
-    model_q5_k="${path_models}/ggml-model-q5_k.bin"
-    model_q6_k="${path_models}/ggml-model-q6_k.bin"
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test="${path_wiki}/wiki.test.raw"

@@ -310,17 +358,17 @@ function gg_run_open_llama_7b_v2 {
    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/quantize ${model_f16} ${model_q6_k} q6_k

-    (time ./bin/main --model ${model_f16}  -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/main --model ${model_q8_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/main --model ${model_q4_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/main --model ${model_q4_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/main --model ${model_q5_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/main --model ${model_q5_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/main --model ${model_q2_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/main --model ${model_q3_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/main --model ${model_q4_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/main --model ${model_q5_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/main --model ${model_q6_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+    (time ./bin/main --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
@@ -359,6 +407,48 @@ function gg_run_open_llama_7b_v2 {
    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log

+    # lora
+    function compare_ppl {
+        qnt="$1"
+        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
+            return 20
+        fi
+
+        printf '  - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
+        return 0
+    }
+
+    path_lora="../models-mnt/open-llama/7B-v2/lora"
+    path_shakespeare="../models-mnt/shakespeare"
+
+    shakespeare="${path_shakespeare}/shakespeare.txt"
+    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
+
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
+    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
+    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
+
+    python3 ../convert-lora-to-ggml.py ${path_lora}
+
+    # f16
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
+    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
+    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # currently not supported by the CUDA backend
+    # q8_0
+    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
+    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
+    #compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
+    # q8_0 + f16 lora-base
+    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
+    #compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
+
    set +e
 }

@@ -368,6 +458,7 @@ function gg_sum_open_llama_7b_v2 {
    gg_printf 'OpenLLaMA 7B-v2:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -379,6 +470,11 @@ function gg_sum_open_llama_7b_v2 {
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
+    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
+    #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
+    #gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
+    #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }

 ## main
@@ -391,6 +487,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    ln -sfn ${mnt_models} ${SRC}/models-mnt

    python3 -m pip install -r ${SRC}/requirements.txt
+    python3 -m pip install --editable gguf-py
 fi

 ret=0
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -0,0 +1,20 @@
+# common
+
+set(TARGET common)
+
+add_library(${TARGET} OBJECT
+    common.h
+    common.cpp
+    console.h
+    console.cpp
+    grammar-parser.h
+    grammar-parser.cpp
+    )
+
+if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
+target_include_directories(${TARGET} PUBLIC .)
+target_compile_features(${TARGET} PUBLIC cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE llama)
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -25,7 +25,6 @@
 #else
 #include <sys/ioctl.h>
 #include <unistd.h>
-#include <wchar.h>
 #endif

 #if defined(_MSC_VER)
@@ -171,18 +170,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "-gqa" || arg == "--gqa") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_gqa = std::stoi(argv[i]);
-        } else if (arg == "-eps" || arg == "--rms-norm-eps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rms_norm_eps = std::stof(argv[i]);
        } else if (arg == "--rope-freq-base") {
            if (++i >= argc) {
                invalid_param = true;
@@ -195,6 +182,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.rope_freq_scale = std::stof(argv[i]);
+        } else if (arg == "--rope-scale") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_scale = 1.0f/std::stof(argv[i]);
        } else if (arg == "--memory-f32") {
            params.memory_f16 = false;
        } else if (arg == "--top-p") {
@@ -269,6 +262,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.cfg_negative_prompt = argv[i];
+        } else if (arg == "--cfg-negative-prompt-file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
+            if (params.cfg_negative_prompt.back() == '\n') {
+                params.cfg_negative_prompt.pop_back();
+            }
        } else if (arg == "--cfg-scale") {
            if (++i >= argc) {
                invalid_param = true;
@@ -281,7 +289,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_batch = std::stoi(argv[i]);
-            params.n_batch = std::min(512, params.n_batch);
        } else if (arg == "--keep") {
            if (++i >= argc) {
                invalid_param = true;
@@ -329,6 +336,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.instruct = true;
        } else if (arg == "--multiline-input") {
            params.multiline_input = true;
+        } else if (arg == "--simple-io") {
+            params.simple_io = true;
        } else if (arg == "--color") {
            params.use_color = true;
        } else if (arg == "--mlock") {
@@ -352,7 +361,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 #ifdef GGML_USE_CUBLAS
            params.main_gpu = std::stoi(argv[i]);
 #else
-      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
 #endif
        } else if (arg == "--tensor-split" || arg == "-ts") {
            if (++i >= argc) {
@@ -376,13 +385,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                }
            }
 #else
-      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+        } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
+#ifdef GGML_USE_CUBLAS
+            params.mul_mat_q = false;
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
 #endif // GGML_USE_CUBLAS
        } else if (arg == "--low-vram" || arg == "-lv") {
 #ifdef GGML_USE_CUBLAS
            params.low_vram = true;
 #else
-      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
        } else if (arg == "--no-mmap") {
            params.use_mmap = false;
@@ -402,10 +417,28 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.antiprompt.push_back(argv[i]);
        } else if (arg == "--perplexity") {
            params.perplexity = true;
-        } else if (arg == "--perplexity-lines") {
-            params.perplexity_lines = true;
+        } else if (arg == "--ppl-stride") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.ppl_stride = std::stoi(argv[i]);
+        } else if (arg == "--ppl-output-type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.ppl_output_type = std::stoi(argv[i]);
+        } else if (arg == "--hellaswag") {
+            params.hellaswag = true;
+        } else if (arg == "--hellaswag-tasks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.hellaswag_tasks = std::stoi(argv[i]);
        } else if (arg == "--ignore-eos") {
-            params.logit_bias[llama_token_eos()] = -INFINITY;
+            params.ignore_eos = true;
        } else if (arg == "--no-penalize-nl") {
            params.penalize_nl = false;
        } else if (arg == "-l" || arg == "--logit-bias") {
@@ -524,11 +557,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
    fprintf(stdout, "  -f FNAME, --file FNAME\n");
    fprintf(stdout, "                        prompt file to start generation.\n");
-    fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
+    fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
    fprintf(stdout, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
    fprintf(stdout, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
    fprintf(stdout, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
@@ -548,19 +579,23 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
    fprintf(stdout, "  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
    fprintf(stdout, "  --grammar-file FNAME  file to read grammar from\n");
-    fprintf(stdout, "  --cfg-negative-prompt PROMPT \n");
+    fprintf(stdout, "  --cfg-negative-prompt PROMPT\n");
    fprintf(stdout, "                        negative prompt to use for guidance. (default: empty)\n");
+    fprintf(stdout, "  --cfg-negative-prompt-file FNAME\n");
+    fprintf(stdout, "                        negative prompt file to use for guidance. (default: empty)\n");
    fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
    fprintf(stdout, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    fprintf(stdout, "  --no-penalize-nl      do not penalize newline token\n");
    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
    fprintf(stdout, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
    fprintf(stdout, "  --perplexity          compute perplexity over each ctx window of the prompt\n");
-    fprintf(stdout, "  --perplexity-lines    compute perplexity over each line of the prompt\n");
-    fprintf(stdout, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    fprintf(stdout, "  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
+    fprintf(stdout, "  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
+    fprintf(stdout, "  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
    fprintf(stdout, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
    if (llama_mlock_supported()) {
        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
@@ -576,12 +611,18 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "                        number of layers to store in VRAM\n");
    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
-    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+#ifdef GGML_USE_CUBLAS
+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
+#endif // GGML_USE_CUBLAS
 #endif
    fprintf(stdout, "  --mtest               compute maximum memory usage\n");
    fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");
    fprintf(stdout, "  --verbose-prompt      print prompt before generation\n");
+    fprintf(stderr, "  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
    fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
    fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
    fprintf(stdout, "  -m FNAME, --model FNAME\n");
@@ -608,28 +649,20 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
    return "The";
 }

-// TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-
-    return res;
-}
+//
+// Model utils
+//

 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    auto lparams = llama_context_default_params();

    lparams.n_ctx           = params.n_ctx;
    lparams.n_batch         = params.n_batch;
-    lparams.n_gqa           = params.n_gqa;
-    lparams.rms_norm_eps    = params.rms_norm_eps;
    lparams.n_gpu_layers    = params.n_gpu_layers;
    lparams.main_gpu        = params.main_gpu;
    lparams.tensor_split    = params.tensor_split;
    lparams.low_vram        = params.low_vram;
+    lparams.mul_mat_q       = params.mul_mat_q;
    lparams.seed            = params.seed;
    lparams.f16_kv          = params.memory_f16;
    lparams.use_mmap        = params.use_mmap;
@@ -642,7 +675,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    return lparams;
 }

-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
    auto lparams = llama_context_params_from_gpt_params(params);

    llama_model * model  = llama_load_model_from_file(params.model.c_str(), lparams);
@@ -671,378 +704,78 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
        }
    }

+    if (params.ignore_eos) {
+        params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+    }
+
    return std::make_tuple(model, lctx);
 }

-void console_init(console_state & con_st) {
-#if defined(_WIN32)
-    // Windows-specific console initialization
-    DWORD dwMode = 0;
-    con_st.hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
-    if (con_st.hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(con_st.hConsole, &dwMode)) {
-        con_st.hConsole = GetStdHandle(STD_ERROR_HANDLE);
-        if (con_st.hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(con_st.hConsole, &dwMode))) {
-            con_st.hConsole = NULL;
-        }
-    }
-    if (con_st.hConsole) {
-        // Enable ANSI colors on Windows 10+
-        if (con_st.use_color && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
-            SetConsoleMode(con_st.hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
-        }
-        // Set console output codepage to UTF8
-        SetConsoleOutputCP(CP_UTF8);
-    }
-    HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
-    if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
-        // Set console input codepage to UTF16
-        _setmode(_fileno(stdin), _O_WTEXT);
+//
+// Vocab utils
+//

-        // Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
-        dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
-        SetConsoleMode(hConIn, dwMode);
-    }
-#else
-    // POSIX-specific console initialization
-    struct termios new_termios;
-    tcgetattr(STDIN_FILENO, &con_st.prev_state);
-    new_termios = con_st.prev_state;
-    new_termios.c_lflag &= ~(ICANON | ECHO);
-    new_termios.c_cc[VMIN] = 1;
-    new_termios.c_cc[VTIME] = 0;
-    tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
-
-    con_st.tty = fopen("/dev/tty", "w+");
-    if (con_st.tty != nullptr) {
-        con_st.out = con_st.tty;
-    }
-
-    setlocale(LC_ALL, "");
-#endif
-}
-
-void console_cleanup(console_state & con_st) {
-    // Reset console color
-    console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
-
-#if !defined(_WIN32)
-    if (con_st.tty != nullptr) {
-        con_st.out = stdout;
-        fclose(con_st.tty);
-        con_st.tty = nullptr;
-    }
-    // Restore the terminal settings on POSIX systems
-    tcsetattr(STDIN_FILENO, TCSANOW, &con_st.prev_state);
-#endif
-}
-
-/* Keep track of current color of output, and emit ANSI code if it changes. */
-void console_set_color(console_state & con_st, console_color_t color) {
-    if (con_st.use_color && con_st.color != color) {
-        fflush(stdout);
-        switch(color) {
-            case CONSOLE_COLOR_DEFAULT:
-                fprintf(con_st.out, ANSI_COLOR_RESET);
-                break;
-            case CONSOLE_COLOR_PROMPT:
-                fprintf(con_st.out, ANSI_COLOR_YELLOW);
-                break;
-            case CONSOLE_COLOR_USER_INPUT:
-                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
-                break;
-            case CONSOLE_COLOR_ERROR:
-                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED);
-                break;
-        }
-        con_st.color = color;
-        fflush(con_st.out);
-    }
-}
-
-char32_t getchar32() {
-#if defined(_WIN32)
-    HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
-    wchar_t high_surrogate = 0;
-
-    while (true) {
-        INPUT_RECORD record;
-        DWORD count;
-        if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
-            return WEOF;
-        }
-
-        if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
-            wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
-            if (wc == 0) {
-                continue;
-            }
-
-            if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
-                high_surrogate = wc;
-                continue;
-            } else if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
-                if (high_surrogate != 0) { // Check if we have a high surrogate
-                    return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
-                }
-            }
-
-            high_surrogate = 0; // Reset the high surrogate
-            return static_cast<char32_t>(wc);
-        }
-    }
-#else
-    wchar_t wc = getwchar();
-    if (static_cast<wint_t>(wc) == WEOF) {
-        return WEOF;
-    }
-
-#if WCHAR_MAX == 0xFFFF
-    if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
-        wchar_t low_surrogate = getwchar();
-        if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
-            return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
-        }
-    }
-    if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
-        return 0xFFFD; // Return the replacement character U+FFFD
-    }
-#endif
-
-    return static_cast<char32_t>(wc);
-#endif
-}
-
-void pop_cursor(console_state & con_st) {
-#if defined(_WIN32)
-    if (con_st.hConsole != NULL) {
-        CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
-        GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo);
-
-        COORD newCursorPosition = bufferInfo.dwCursorPosition;
-        if (newCursorPosition.X == 0) {
-            newCursorPosition.X = bufferInfo.dwSize.X - 1;
-            newCursorPosition.Y -= 1;
-        } else {
-            newCursorPosition.X -= 1;
-        }
-
-        SetConsoleCursorPosition(con_st.hConsole, newCursorPosition);
-        return;
-    }
-#endif
-    putc('\b', con_st.out);
-}
-
-int estimateWidth(char32_t codepoint) {
-#if defined(_WIN32)
-    return 1;
-#else
-    return wcwidth(codepoint);
-#endif
-}
-
-int put_codepoint(console_state & con_st, const char* utf8_codepoint, size_t length, int expectedWidth) {
-#if defined(_WIN32)
-    CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
-    if (!GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo)) {
-        // go with the default
-        return expectedWidth;
-    }
-    COORD initialPosition = bufferInfo.dwCursorPosition;
-    DWORD nNumberOfChars = length;
-    WriteConsole(con_st.hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
-
-    CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
-    GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
-
-    // Figure out our real position if we're in the last column
-    if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
-        DWORD nNumberOfChars;
-        WriteConsole(con_st.hConsole, &" \b", 2, &nNumberOfChars, NULL);
-        GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
-    }
-
-    int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
-    if (width < 0) {
-        width += newBufferInfo.dwSize.X;
-    }
-    return width;
-#else
-    // we can trust expectedWidth if we've got one
-    if (expectedWidth >= 0 || con_st.tty == nullptr) {
-        fwrite(utf8_codepoint, length, 1, con_st.out);
-        return expectedWidth;
-    }
-
-    fputs("\033[6n", con_st.tty); // Query cursor position
-    int x1, x2, y1, y2;
-    int results = 0;
-    results = fscanf(con_st.tty, "\033[%d;%dR", &y1, &x1);
-
-    fwrite(utf8_codepoint, length, 1, con_st.tty);
-
-    fputs("\033[6n", con_st.tty); // Query cursor position
-    results += fscanf(con_st.tty, "\033[%d;%dR", &y2, &x2);
-
-    if (results != 4) {
-        return expectedWidth;
-    }
-
-    int width = x2 - x1;
-    if (width < 0) {
-        // Calculate the width considering text wrapping
-        struct winsize w;
-        ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
-        width += w.ws_col;
-    }
-    return width;
-#endif
-}
-
-void replace_last(console_state & con_st, char ch) {
-#if defined(_WIN32)
-    pop_cursor(con_st);
-    put_codepoint(con_st, &ch, 1, 1);
-#else
-    fprintf(con_st.out, "\b%c", ch);
-#endif
-}
-
-void append_utf8(char32_t ch, std::string & out) {
-    if (ch <= 0x7F) {
-        out.push_back(static_cast<unsigned char>(ch));
-    } else if (ch <= 0x7FF) {
-        out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
-        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-    } else if (ch <= 0xFFFF) {
-        out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
-        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
-        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-    } else if (ch <= 0x10FFFF) {
-        out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
-        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
-        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
-        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+std::vector<llama_token> llama_tokenize(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    // upper limit for the number of tokens
+    int n_tokens = text.length() + add_bos;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        GGML_ASSERT(check == -n_tokens);
    } else {
-        // Invalid Unicode code point
+        result.resize(n_tokens);
    }
+    return result;
 }

-// Helper function to remove the last UTF-8 character from a string
-void pop_back_utf8_char(std::string & line) {
-    if (line.empty()) {
-        return;
-    }
-
-    size_t pos = line.length() - 1;
-
-    // Find the start of the last UTF-8 character (checking up to 4 bytes back)
-    for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
-        if ((line[pos] & 0xC0) != 0x80) break; // Found the start of the character
-    }
-    line.erase(pos);
-}
-
-bool console_readline(console_state & con_st, std::string & line) {
-    console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-    if (con_st.out != stdout) {
-        fflush(stdout);
-    }
-
-    line.clear();
-    std::vector<int> widths;
-    bool is_special_char = false;
-    bool end_of_stream = false;
-
-    char32_t input_char;
-    while (true) {
-        fflush(con_st.out); // Ensure all output is displayed before waiting for input
-        input_char = getchar32();
-
-        if (input_char == '\r' || input_char == '\n') {
-            break;
-        }
-
-        if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
-            end_of_stream = true;
-            break;
-        }
-
-        if (is_special_char) {
-            console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-            replace_last(con_st, line.back());
-            is_special_char = false;
-        }
-
-        if (input_char == '\033') { // Escape sequence
-            char32_t code = getchar32();
-            if (code == '[' || code == 0x1B) {
-                // Discard the rest of the escape sequence
-                while ((code = getchar32()) != (char32_t) WEOF) {
-                    if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
-                        break;
-                    }
-                }
-            }
-        } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
-            if (!widths.empty()) {
-                int count;
-                do {
-                    count = widths.back();
-                    widths.pop_back();
-                    // Move cursor back, print space, and move cursor back again
-                    for (int i = 0; i < count; i++) {
-                        replace_last(con_st, ' ');
-                        pop_cursor(con_st);
-                    }
-                    pop_back_utf8_char(line);
-                } while (count == 0 && !widths.empty());
-            }
-        } else {
-            int offset = line.length();
-            append_utf8(input_char, line);
-            int width = put_codepoint(con_st, line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
-            if (width < 0) {
-                width = 0;
-            }
-            widths.push_back(width);
-        }
-
-        if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
-            console_set_color(con_st, CONSOLE_COLOR_PROMPT);
-            replace_last(con_st, line.back());
-            is_special_char = true;
-        }
-    }
-
-    bool has_more = con_st.multiline_input;
-    if (is_special_char) {
-        replace_last(con_st, ' ');
-        pop_cursor(con_st);
-
-        char last = line.back();
-        line.pop_back();
-        if (last == '\\') {
-            line += '\n';
-            fputc('\n', con_st.out);
-            has_more = !has_more;
-        } else {
-            // llama will just eat the single space, it won't act as a space
-            if (line.length() == 1 && line.back() == ' ') {
-                line.clear();
-                pop_cursor(con_st);
-            }
-            has_more = false;
-        }
+std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_token_to_piece(ctx, token, result.data(), result.size());
+        GGML_ASSERT(check == -n_tokens);
    } else {
-        if (end_of_stream) {
-            has_more = false;
-        } else {
-            line += '\n';
-            fputc('\n', con_st.out);
-        }
+        result.resize(n_tokens);
    }

-    fflush(con_st.out);
-    return has_more;
+    return std::string(result.data(), result.size());
+}
+
+std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
+    const llama_token bos_id = llama_token_bos(ctx);
+
+    std::string piece;
+    std::string result;
+
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        piece = llama_token_to_piece(ctx, tokens[i]);
+
+        // remove the leading space of the first non-BOS token
+        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
+            piece = piece.substr(1);
+        }
+
+        result += piece;
+    }
+
+    return result;
+}
+
+std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
+    std::string piece;
+    std::string result;
+
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        piece = llama_token_to_piece(ctx, tokens[i]);
+
+        result += piece;
+    }
+
+    return result;
 }
--- a/examples/common.h
+++ b/examples/common.h
@@ -11,11 +11,6 @@
 #include <unordered_map>
 #include <tuple>

-#if !defined (_WIN32)
-#include <stdio.h>
-#include <termios.h>
-#endif
-
 //
 // CLI argument parsing
 //
@@ -27,19 +22,17 @@ struct gpt_params {
    int32_t n_predict                       = -1;   // new tokens to predict
    int32_t n_ctx                           = 512;  // context size
    int32_t n_batch                         = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_gqa                           = 1;    // grouped-query attention factor (TODO: move to hparams)
    int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt
    int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
    int32_t n_gpu_layers                    = 0;    // number of layers to store in VRAM
    int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
    int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
-    float   rms_norm_eps                    = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
+    int32_t n_beams                         = 0;    // if non-zero then use beam search of given width.
    float   rope_freq_base                  = 10000.0f; // RoPE base frequency
    float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor

    // sampling parameters
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
    int32_t top_k             = 40;    // <= 0 to use vocab size
    float   top_p             = 0.95f; // 1.0 = disabled
    float   tfs_z             = 1.00f; // 1.0 = disabled
@@ -53,12 +46,14 @@ struct gpt_params {
    float   mirostat_tau      = 5.00f; // target entropy
    float   mirostat_eta      = 0.10f; // learning rate

+    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+
    // Classifier-Free Guidance
    // https://arxiv.org/abs/2306.17806
    std::string cfg_negative_prompt;       // string to help guidance
    float       cfg_scale         = 1.f;   // How strong is guidance

-    std::string model             = "models/7B/ggml-model.bin"; // model path
+    std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
    std::string model_alias       = "unknown"; // model alias
    std::string prompt            = "";
    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
@@ -70,7 +65,15 @@ struct gpt_params {
    std::string lora_adapter = "";  // lora adapter path
    std::string lora_base    = "";  // base model path for the lora adapter

-    bool low_vram          = false;   // if true, reduce VRAM usage at the cost of performance
+    int  ppl_stride        = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int  ppl_output_type   = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+                                    //                                       (which is more convenient to use for plotting)
+                                    //
+    bool hellaswag         = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
+
+    bool low_vram          = false; // if true, reduce VRAM usage at the cost of performance
+    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
@@ -81,12 +84,13 @@ struct gpt_params {
    bool embedding         = false; // get only sentence embedding
    bool interactive_first = false; // wait for user input immediately
    bool multiline_input   = false; // reverse the usage of `\`
+    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
+    bool ignore_eos        = false; // ignore generated EOS tokens
    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool penalize_nl       = true;  // consider newlines as a repeatable token
    bool perplexity        = false; // compute perplexity over the prompt
-    bool perplexity_lines  = false; // compute perplexity over each line of the prompt
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool mem_test          = false; // compute maximum memory usage
@@ -101,54 +105,42 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

 std::string gpt_random_prompt(std::mt19937 & rng);

-//
-// Vocab utils
-//
-
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
-
 //
 // Model utils
 //

-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

 //
-// Console utils
+// Vocab utils
 //

-#define ANSI_COLOR_RED     "\x1b[31m"
-#define ANSI_COLOR_GREEN   "\x1b[32m"
-#define ANSI_COLOR_YELLOW  "\x1b[33m"
-#define ANSI_COLOR_BLUE    "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN    "\x1b[36m"
-#define ANSI_COLOR_RESET   "\x1b[0m"
-#define ANSI_BOLD          "\x1b[1m"
+// tokenizes a string into a vector of tokens
+// should work similar to Python's `tokenizer.encode`
+std::vector<llama_token> llama_tokenize(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos);

-enum console_color_t {
-    CONSOLE_COLOR_DEFAULT=0,
-    CONSOLE_COLOR_PROMPT,
-    CONSOLE_COLOR_USER_INPUT,
-    CONSOLE_COLOR_ERROR
-};
+// tokenizes a token into a piece
+// should work similar to Python's `tokenizer.id_to_piece`
+std::string llama_token_to_piece(
+        const struct llama_context * ctx,
+                       llama_token   token);

-struct console_state {
-    bool multiline_input = false;
-    bool use_color = false;
-    console_color_t color = CONSOLE_COLOR_DEFAULT;
+// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
+//       that takes into account the tokenizer type and decides how to handle the leading space
+//
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+// removes the leading space from the first non-BOS token
+std::string llama_detokenize_spm(
+                         llama_context * ctx,
+        const std::vector<llama_token> & tokens);

-    FILE* out = stdout;
-#if defined (_WIN32)
-    void* hConsole;
-#else
-    FILE* tty = nullptr;
-    termios prev_state;
-#endif
-};
-
-void console_init(console_state & con_st);
-void console_cleanup(console_state & con_st);
-void console_set_color(console_state & con_st, console_color_t color);
-bool console_readline(console_state & con_st, std::string & line);
+// detokenizes a vector of tokens into a string
+// should work similar to Python's `tokenizer.decode`
+std::string llama_detokenize_bpe(
+                         llama_context * ctx,
+        const std::vector<llama_token> & tokens);
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -0,0 +1,500 @@
+#include "console.h"
+#include <vector>
+#include <iostream>
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <fcntl.h>
+#include <io.h>
+#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
+#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004
+#endif
+#else
+#include <climits>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <wchar.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <termios.h>
+#endif
+
+#define ANSI_COLOR_RED     "\x1b[31m"
+#define ANSI_COLOR_GREEN   "\x1b[32m"
+#define ANSI_COLOR_YELLOW  "\x1b[33m"
+#define ANSI_COLOR_BLUE    "\x1b[34m"
+#define ANSI_COLOR_MAGENTA "\x1b[35m"
+#define ANSI_COLOR_CYAN    "\x1b[36m"
+#define ANSI_COLOR_RESET   "\x1b[0m"
+#define ANSI_BOLD          "\x1b[1m"
+
+namespace console {
+
+    //
+    // Console state
+    //
+
+    static bool      advanced_display = false;
+    static bool      simple_io        = true;
+    static display_t current_display  = reset;
+
+    static FILE*     out              = stdout;
+
+#if defined (_WIN32)
+    static void*     hConsole;
+#else
+    static FILE*     tty              = nullptr;
+    static termios   initial_state;
+#endif
+
+    //
+    // Init and cleanup
+    //
+
+    void init(bool use_simple_io, bool use_advanced_display) {
+        advanced_display = use_advanced_display;
+        simple_io = use_simple_io;
+#if defined(_WIN32)
+        // Windows-specific console initialization
+        DWORD dwMode = 0;
+        hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
+        if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
+            hConsole = GetStdHandle(STD_ERROR_HANDLE);
+            if (hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(hConsole, &dwMode))) {
+                hConsole = nullptr;
+                simple_io = true;
+            }
+        }
+        if (hConsole) {
+            // Check conditions combined to reduce nesting
+            if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) &&
+                !SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
+                advanced_display = false;
+            }
+            // Set console output codepage to UTF8
+            SetConsoleOutputCP(CP_UTF8);
+        }
+        HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
+        if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
+            // Set console input codepage to UTF16
+            _setmode(_fileno(stdin), _O_WTEXT);
+
+            // Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
+            if (simple_io) {
+                dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
+            } else {
+                dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
+            }
+            if (!SetConsoleMode(hConIn, dwMode)) {
+                simple_io = true;
+            }
+        }
+#else
+        // POSIX-specific console initialization
+        if (!simple_io) {
+            struct termios new_termios;
+            tcgetattr(STDIN_FILENO, &initial_state);
+            new_termios = initial_state;
+            new_termios.c_lflag &= ~(ICANON | ECHO);
+            new_termios.c_cc[VMIN] = 1;
+            new_termios.c_cc[VTIME] = 0;
+            tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
+
+            tty = fopen("/dev/tty", "w+");
+            if (tty != nullptr) {
+                out = tty;
+            }
+        }
+
+        setlocale(LC_ALL, "");
+#endif
+    }
+
+    void cleanup() {
+        // Reset console display
+        set_display(reset);
+
+#if !defined(_WIN32)
+        // Restore settings on POSIX systems
+        if (!simple_io) {
+            if (tty != nullptr) {
+                out = stdout;
+                fclose(tty);
+                tty = nullptr;
+            }
+            tcsetattr(STDIN_FILENO, TCSANOW, &initial_state);
+        }
+#endif
+    }
+
+    //
+    // Display and IO
+    //
+
+    // Keep track of current display and only emit ANSI code if it changes
+    void set_display(display_t display) {
+        if (advanced_display && current_display != display) {
+            fflush(stdout);
+            switch(display) {
+                case reset:
+                    fprintf(out, ANSI_COLOR_RESET);
+                    break;
+                case prompt:
+                    fprintf(out, ANSI_COLOR_YELLOW);
+                    break;
+                case user_input:
+                    fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
+                    break;
+                case error:
+                    fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
+            }
+            current_display = display;
+            fflush(out);
+        }
+    }
+
+    char32_t getchar32() {
+#if defined(_WIN32)
+        HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
+        wchar_t high_surrogate = 0;
+
+        while (true) {
+            INPUT_RECORD record;
+            DWORD count;
+            if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
+                return WEOF;
+            }
+
+            if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
+                wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
+                if (wc == 0) {
+                    continue;
+                }
+
+                if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
+                    high_surrogate = wc;
+                    continue;
+                }
+                if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
+                    if (high_surrogate != 0) { // Check if we have a high surrogate
+                        return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
+                    }
+                }
+
+                high_surrogate = 0; // Reset the high surrogate
+                return static_cast<char32_t>(wc);
+            }
+        }
+#else
+        wchar_t wc = getwchar();
+        if (static_cast<wint_t>(wc) == WEOF) {
+            return WEOF;
+        }
+
+#if WCHAR_MAX == 0xFFFF
+        if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
+            wchar_t low_surrogate = getwchar();
+            if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
+                return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
+            }
+        }
+        if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
+            return 0xFFFD; // Return the replacement character U+FFFD
+        }
+#endif
+
+        return static_cast<char32_t>(wc);
+#endif
+    }
+
+    void pop_cursor() {
+#if defined(_WIN32)
+        if (hConsole != NULL) {
+            CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
+            GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
+
+            COORD newCursorPosition = bufferInfo.dwCursorPosition;
+            if (newCursorPosition.X == 0) {
+                newCursorPosition.X = bufferInfo.dwSize.X - 1;
+                newCursorPosition.Y -= 1;
+            } else {
+                newCursorPosition.X -= 1;
+            }
+
+            SetConsoleCursorPosition(hConsole, newCursorPosition);
+            return;
+        }
+#endif
+        putc('\b', out);
+    }
+
+    int estimateWidth(char32_t codepoint) {
+#if defined(_WIN32)
+        return 1;
+#else
+        return wcwidth(codepoint);
+#endif
+    }
+
+    int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
+#if defined(_WIN32)
+        CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
+        if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
+            // go with the default
+            return expectedWidth;
+        }
+        COORD initialPosition = bufferInfo.dwCursorPosition;
+        DWORD nNumberOfChars = length;
+        WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
+
+        CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
+        GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
+
+        // Figure out our real position if we're in the last column
+        if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
+            DWORD nNumberOfChars;
+            WriteConsole(hConsole, &" \b", 2, &nNumberOfChars, NULL);
+            GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
+        }
+
+        int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
+        if (width < 0) {
+            width += newBufferInfo.dwSize.X;
+        }
+        return width;
+#else
+        // We can trust expectedWidth if we've got one
+        if (expectedWidth >= 0 || tty == nullptr) {
+            fwrite(utf8_codepoint, length, 1, out);
+            return expectedWidth;
+        }
+
+        fputs("\033[6n", tty); // Query cursor position
+        int x1;
+        int y1;
+        int x2;
+        int y2;
+        int results = 0;
+        results = fscanf(tty, "\033[%d;%dR", &y1, &x1);
+
+        fwrite(utf8_codepoint, length, 1, tty);
+
+        fputs("\033[6n", tty); // Query cursor position
+        results += fscanf(tty, "\033[%d;%dR", &y2, &x2);
+
+        if (results != 4) {
+            return expectedWidth;
+        }
+
+        int width = x2 - x1;
+        if (width < 0) {
+            // Calculate the width considering text wrapping
+            struct winsize w;
+            ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
+            width += w.ws_col;
+        }
+        return width;
+#endif
+    }
+
+    void replace_last(char ch) {
+#if defined(_WIN32)
+        pop_cursor();
+        put_codepoint(&ch, 1, 1);
+#else
+        fprintf(out, "\b%c", ch);
+#endif
+    }
+
+    void append_utf8(char32_t ch, std::string & out) {
+        if (ch <= 0x7F) {
+            out.push_back(static_cast<unsigned char>(ch));
+        } else if (ch <= 0x7FF) {
+            out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
+            out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+        } else if (ch <= 0xFFFF) {
+            out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
+            out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
+            out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+        } else if (ch <= 0x10FFFF) {
+            out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
+            out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
+            out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
+            out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+        } else {
+            // Invalid Unicode code point
+        }
+    }
+
+    // Helper function to remove the last UTF-8 character from a string
+    void pop_back_utf8_char(std::string & line) {
+        if (line.empty()) {
+            return;
+        }
+
+        size_t pos = line.length() - 1;
+
+        // Find the start of the last UTF-8 character (checking up to 4 bytes back)
+        for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
+            if ((line[pos] & 0xC0) != 0x80) {
+                break; // Found the start of the character
+            }
+        }
+        line.erase(pos);
+    }
+
+    bool readline_advanced(std::string & line, bool multiline_input) {
+        if (out != stdout) {
+            fflush(stdout);
+        }
+
+        line.clear();
+        std::vector<int> widths;
+        bool is_special_char = false;
+        bool end_of_stream = false;
+
+        char32_t input_char;
+        while (true) {
+            fflush(out); // Ensure all output is displayed before waiting for input
+            input_char = getchar32();
+
+            if (input_char == '\r' || input_char == '\n') {
+                break;
+            }
+
+            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
+                end_of_stream = true;
+                break;
+            }
+
+            if (is_special_char) {
+                set_display(user_input);
+                replace_last(line.back());
+                is_special_char = false;
+            }
+
+            if (input_char == '\033') { // Escape sequence
+                char32_t code = getchar32();
+                if (code == '[' || code == 0x1B) {
+                    // Discard the rest of the escape sequence
+                    while ((code = getchar32()) != (char32_t) WEOF) {
+                        if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
+                            break;
+                        }
+                    }
+                }
+            } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
+                if (!widths.empty()) {
+                    int count;
+                    do {
+                        count = widths.back();
+                        widths.pop_back();
+                        // Move cursor back, print space, and move cursor back again
+                        for (int i = 0; i < count; i++) {
+                            replace_last(' ');
+                            pop_cursor();
+                        }
+                        pop_back_utf8_char(line);
+                    } while (count == 0 && !widths.empty());
+                }
+            } else {
+                int offset = line.length();
+                append_utf8(input_char, line);
+                int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
+                if (width < 0) {
+                    width = 0;
+                }
+                widths.push_back(width);
+            }
+
+            if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
+                set_display(prompt);
+                replace_last(line.back());
+                is_special_char = true;
+            }
+        }
+
+        bool has_more = multiline_input;
+        if (is_special_char) {
+            replace_last(' ');
+            pop_cursor();
+
+            char last = line.back();
+            line.pop_back();
+            if (last == '\\') {
+                line += '\n';
+                fputc('\n', out);
+                has_more = !has_more;
+            } else {
+                // llama will just eat the single space, it won't act as a space
+                if (line.length() == 1 && line.back() == ' ') {
+                    line.clear();
+                    pop_cursor();
+                }
+                has_more = false;
+            }
+        } else {
+            if (end_of_stream) {
+                has_more = false;
+            } else {
+                line += '\n';
+                fputc('\n', out);
+            }
+        }
+
+        fflush(out);
+        return has_more;
+    }
+
+    bool readline_simple(std::string & line, bool multiline_input) {
+#if defined(_WIN32)
+        std::wstring wline;
+        if (!std::getline(std::wcin, wline)) {
+            // Input stream is bad or EOF received
+            line.clear();
+            GenerateConsoleCtrlEvent(CTRL_C_EVENT, 0);
+            return false;
+        }
+
+        int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), NULL, 0, NULL, NULL);
+        line.resize(size_needed);
+        WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), &line[0], size_needed, NULL, NULL);
+#else
+        if (!std::getline(std::cin, line)) {
+            // Input stream is bad or EOF received
+            line.clear();
+            return false;
+        }
+#endif
+        if (!line.empty()) {
+            char last = line.back();
+            if (last == '/') { // Always return control on '/' symbol
+                line.pop_back();
+                return false;
+            }
+            if (last == '\\') { // '\\' changes the default action
+                line.pop_back();
+                multiline_input = !multiline_input;
+            }
+        }
+        line += '\n';
+
+        // By default, continue input if multiline_input is set
+        return multiline_input;
+    }
+
+    bool readline(std::string & line, bool multiline_input) {
+        set_display(user_input);
+
+        if (simple_io) {
+            return readline_simple(line, multiline_input);
+        }
+        return readline_advanced(line, multiline_input);
+    }
+
+}
--- a/common/console.h
+++ b/common/console.h
@@ -0,0 +1,19 @@
+// Console functions
+
+#pragma once
+
+#include <string>
+
+namespace console {
+    enum display_t {
+        reset = 0,
+        prompt,
+        user_input,
+        error
+    };
+
+    void init(bool use_simple_io, bool use_advanced_display);
+    void cleanup();
+    void set_display(display_t display);
+    bool readline(std::string & line, bool multiline_input);
+}
--- a/examples/grammar-parser.cpp
+++ b/examples/grammar-parser.cpp
@@ -405,7 +405,7 @@ namespace grammar_parser {
            for (size_t i = 0, end = state.rules.size(); i < end; i++) {
                // fprintf(file, "%zu: ", i);
                // print_rule_binary(file, state.rules[i]);
-                print_rule(file, i, state.rules[i], symbol_id_names);
+                print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
                // fprintf(file, "\n");
            }
        } catch (const std::exception & err) {
--- a/examples/grammar-parser.h
+++ b/examples/grammar-parser.h
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -0,0 +1,279 @@
+#!/usr/bin/env python3
+# HF falcon--> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from transformers import AutoTokenizer
+
+def bytes_to_unicode():
+    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "RWForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.FALCON
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["n_layer"]
+
+gguf_writer.add_name("Falcon")
+gguf_writer.add_context_length(2048) # not in config.json
+gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_head_count(hparams["n_head"])
+if "n_head_kv" in hparams:
+    gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+else:
+    gguf_writer.add_head_count_kv(1)
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
+gguf_writer.add_file_type(ftype)
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[str] = []
+scores: List[float] = []
+toktypes: List[int] = []
+merges: List[str] = []
+
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # gpt2 tokenizer
+    gguf_writer.add_tokenizer_model("gpt2")
+
+    print("gguf: get gpt2 tokenizer merges")
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer_json = json.load(f)
+    merges = tokenizer_json["model"]["merges"]
+
+    gguf_writer.add_token_merges(merges)
+
+    print("gguf: get gpt2 tokenizer vocab")
+
+    vocab_size = len(tokenizer_json["model"]["vocab"])
+
+    # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+    byte_encoder = bytes_to_unicode()
+    byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+    for i in range(vocab_size):
+        if i in reverse_vocab:
+            try:
+                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+            except KeyError:
+                text = bytearray()
+                for c in reverse_vocab[i]:
+                    if ord(c) < 256:  # single byte character
+                        text.append(byte_decoder[ord(c)])
+                    else:  # multibyte special token character
+                        text.extend(c.encode('utf-8'))
+        else:
+            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+            pad_token = f"[PAD{i}]".encode("utf8")
+            text = bytearray(pad_token)
+
+        tokens.append(text)
+        scores.append(0.0)                      # dymmy
+        toktypes.append(gguf.TokenType.NORMAL)  # dummy
+
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_scores(scores)
+    gguf_writer.add_token_types(toktypes)
+
+print("gguf: get special token ids")
+# Look for special tokens in config.json
+
+if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
+    gguf_writer.add_bos_token_id(hparams["bos_token_id"])
+
+if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
+    gguf_writer.add_eos_token_id(hparams["eos_token_id"])
+
+if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
+    gguf_writer.add_unk_token_id(hparams["unk_token_id"])
+
+if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
+    gguf_writer.add_sep_token_id(hparams["sep_token_id"])
+
+if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
+    gguf_writer.add_pad_token_id(hparams["pad_token_id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# params for qkv transform
+n_head    = hparams["n_head"]
+n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+
+head_dim = hparams["hidden_size"] // n_head
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = ("pytorch_model.bin",)
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        # QKV tensor transform
+        # The original query_key_value tensor contains n_head_kv "kv groups",
+        # each consisting of n_head/n_head_kv query weights followed by one key
+        # and one value weight (shared by all query heads in the kv group).
+        # This layout makes it a big pain to work with in GGML.
+        # So we rearrange them here,, so that we have n_head query weights
+        # followed by n_head_kv key weights followed by n_head_kv value weights,
+        # in contiguous fashion.
+        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+        if "query_key_value" in name:
+            qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+            q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
+            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            data = torch.cat((q,k,v)).reshape_as(data)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert-gptneox-hf-to-gguf.py
+++ b/convert-gptneox-hf-to-gguf.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+# HF gptneox--> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from transformers import AutoTokenizer
+
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "GPTNeoXForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.GPTNEOX
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_context_length(hparams["max_position_embeddings"])
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
+gguf_writer.add_head_count(hparams["num_attention_heads"])
+gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[str] = []
+merges: List[str] = []
+
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # gpt2 tokenizer
+    gguf_writer.add_tokenizer_model("gpt2")
+
+    print("gguf: get gpt2 tokenizer merges")
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer_json = json.load(f)
+    merges = tokenizer_json["model"]["merges"]
+
+    gguf_writer.add_token_merges(merges)
+
+    print("gguf: get gpt2 tokenizer vocab")
+
+    vocab_size = len(tokenizer_json["model"]["vocab"])
+
+    # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+    byte_encoder = bytes_to_unicode()
+    byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+    for i in range(vocab_size):
+        if i in reverse_vocab:
+            try:
+                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+            except KeyError:
+                text = bytearray()
+                for c in reverse_vocab[i]:
+                    if ord(c) < 256:  # single byte character
+                        text.append(byte_decoder[ord(c)])
+                    else:  # multibyte special token character
+                        text.extend(c.encode('utf-8'))
+        else:
+            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+            pad_token = f"[PAD{i}]".encode("utf8")
+            text = bytearray(pad_token)
+
+        tokens.append(text)
+
+    gguf_writer.add_token_list(tokens)
+
+    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
+        print("gguf: get special token ids")
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        # find special token ids
+
+        if "bos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = ("pytorch_model.bin",)
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert-llama-7b-pth-to-gguf.py
+++ b/convert-llama-7b-pth-to-gguf.py
@@ -0,0 +1,308 @@
+#!/usr/bin/env python3
+# 7b pth llama --> gguf conversion
+# Only models with a single datafile are supported, like 7B
+# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from sentencepiece import SentencePieceProcessor
+
+#NDArray = np.ndarray[Any, Any]
+# compatible with python < 3.9
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("consolidated."):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "LlamaForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+if num_parts > 1:
+    print("gguf: Only models with a single datafile are supported.")
+
+    sys.exit()
+
+ARCH=gguf.MODEL_ARCH.LLAMA
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+    head_count_kv = hparams["num_key_value_heads"]
+else:
+    head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+    hf_repo = hparams["_name_or_path"]
+else:
+    hf_repo = ""
+
+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+else:
+    print("gguf: can not find ctx length parameter.")
+
+    sys.exit()
+
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[bytes] = []
+scores: List[float] = []
+toktypes: List[int] = []
+
+if Path(dir_model + "/tokenizer.model").is_file():
+    # vocab type sentencepiece
+    print("gguf: get sentencepiece tokenizer vocab and scores")
+
+    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+
+    for i in range(tokenizer.vocab_size()):
+        text: bytes
+        score: float
+
+        piece = tokenizer.id_to_piece(i)
+        text = piece.encode("utf-8")
+        score = tokenizer.get_score(i)
+
+        toktype = 1  # defualt to normal token type
+        if tokenizer.is_unknown(i):
+            toktype = 2
+        if tokenizer.is_control(i):
+            toktype = 3
+
+        # toktype = 4 is user-defined = tokens from added_tokens.json
+
+        if tokenizer.is_unused(i):
+            toktype = 5
+        if tokenizer.is_byte(i):
+            toktype = 6
+
+        tokens.append(text)
+        scores.append(score)
+        toktypes.append(toktype)
+
+    if Path(dir_model + "/added_tokens.json").is_file():
+        with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+            addtokens_json = json.load(f)
+
+            print("gguf: get added tokens")
+
+            for key in addtokens_json:
+                tokens.append( key.encode("utf-8") )
+                scores.append(-1000.0)
+                toktypes.append(4) # user-defined token type
+
+    gguf_writer.add_tokenizer_model("llama")
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_scores(scores)
+    gguf_writer.add_token_types(toktypes)
+
+
+print("gguf: get special token ids")
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # Look for special tokens in tokenizer.json if it exists
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer = json.load(f)
+
+    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]["content"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]["content"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]["content"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]["content"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]["content"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+else:
+    # If no tokenizer.json: Look for special tokens in config.json
+
+    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
+        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
+
+    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
+        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
+
+    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
+        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
+
+    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
+        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
+
+    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
+        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name == "rope.freqs":
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert-llama-ggmlv3-to-gguf.py
+++ b/convert-llama-ggmlv3-to-gguf.py
@@ -0,0 +1,345 @@
+#!/usr/bin/env python3
+import sys, struct, math, argparse
+from pathlib import Path
+
+import numpy as np
+
+import gguf
+
+# Note: Does not support GGML_QKK_64
+QK_K = 256
+# Items here are (block size, type size)
+GGML_QUANT_SIZES = {
+    gguf.GGMLQuantizationType.F32  : (1, 4),
+    gguf.GGMLQuantizationType.F16  : (1, 2),
+    gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
+    gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
+    gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
+    gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
+    gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
+    gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
+    gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
+    gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
+}
+
+class Hyperparameters:
+    def __init__(self):
+        self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
+        self.n_ff = 0
+
+    def set_n_ff(self, model):
+        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
+        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
+        ff_tensor = model.tensors[ff_tensor_idx]
+        self.n_ff = ff_tensor.dims[1]
+
+    def load(self, data, offset):
+        (
+            self.n_vocab,
+            self.n_embd,
+            self.n_mult,
+            self.n_head,
+            self.n_layer,
+            self.n_rot,
+            self.ftype,
+        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
+        return 4 * 7
+
+    def __str__(self):
+        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
+
+class Vocab:
+    def __init__(self):
+        self.items = []
+
+    def load(self, data, offset, n_vocab):
+        orig_offset = offset
+        for _ in range(n_vocab):
+            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
+            assert itemlen < 4096, 'Absurd vocab item length'
+            offset += 4
+            vocab = bytes(data[offset:offset + itemlen])
+            offset += itemlen
+            score = struct.unpack('<f', data[offset:offset + 4])[0]
+            offset += 4
+            self.items.append((vocab, score))
+        return offset - orig_offset
+
+class Tensor:
+    def __init__(self):
+        self.name = None
+        self.dims = ()
+        self.dtype = None
+        self.start_offset = 0
+        self.len_bytes = 0
+
+    def load(self, data, offset):
+        orig_offset = offset
+        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
+        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
+        assert name_len < 4096, 'Absurd tensor name length'
+        quant = GGML_QUANT_SIZES.get(dtype)
+        assert quant is not None, 'Unknown tensor type'
+        (blksize, tysize) = quant
+        offset += 12
+        self.dtype= dtype
+        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
+        offset += 4 * n_dims
+        self.name = bytes(data[offset:offset + name_len])
+        offset += name_len
+        pad = ((offset + 31) & ~31) - offset
+        offset += pad
+        n_elems = np.prod(self.dims)
+        n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
+        self.start_offset = offset
+        self.len_bytes = n_bytes
+        offset += n_bytes
+        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
+        return offset - orig_offset
+
+class GGMLV3Model:
+    def __init__(self):
+        self.hyperparameters = None
+        self.vocab = None
+        self.tensor_map = {}
+        self.tensors = []
+
+    def validate_header(self, data, offset):
+        if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
+            raise ValueError('Only GGJTv3 supported')
+        return 8
+
+    def load(self, data, offset):
+        offset += self.validate_header(data, offset)
+        hp = Hyperparameters()
+        offset += hp.load(data, offset)
+        vocab = Vocab()
+        offset += vocab.load(data, offset, hp.n_vocab)
+        tensors = []
+        tensor_map = {}
+        while offset < len(data):
+            tensor = Tensor()
+            offset += tensor.load(data, offset)
+            tensor_map[tensor.name] = len(tensors)
+            tensors.append(tensor)
+        self.hyperparameters = hp
+        self.vocab = vocab
+        self.tensors = tensors
+        self.tensor_map = tensor_map
+        hp.set_n_ff(self)
+        return offset
+
+class GGMLToGGUF:
+    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
+        hp = ggml_model.hyperparameters
+        self.model = ggml_model
+        self.data = data
+        self.cfg = cfg
+        self.params_override = params_override
+        self.vocab_override = vocab_override
+        if params_override is not None:
+            n_kv_head = params_override.n_head_kv
+        else:
+            if cfg.gqa == 1:
+                n_kv_head = hp.n_head
+            else:
+                gqa = float(cfg.gqa)
+                n_kv_head = None
+                for x in range(1, 256):
+                    if float(hp.n_head) / float(x) == gqa:
+                        n_kv_head = x
+                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
+                print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
+        self.n_kv_head = n_kv_head
+        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
+
+    def save(self):
+        print('* Preparing to save GGUF file')
+        gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
+        self.add_params(gguf_writer)
+        self.add_vocab(gguf_writer)
+        self.add_tensors(gguf_writer)
+        print("    gguf: write header")
+        gguf_writer.write_header_to_file()
+        print("    gguf: write metadata")
+        gguf_writer.write_kv_data_to_file()
+        print("    gguf: write tensors")
+        gguf_writer.write_tensors_to_file()
+        gguf_writer.close()
+
+    def add_params(self, gguf_writer):
+        hp = self.model.hyperparameters
+        cfg = self.cfg
+        desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
+        try:
+            # Filenames aren't necessarily valid UTF8.
+            name = cfg.name if cfg.name is not None else cfg.input.name
+        except UnicodeDecodeError:
+            name = None
+        print('* Adding model parameters and KV items')
+        if name is not None:
+            gguf_writer.add_name(name)
+        gguf_writer.add_description(desc)
+        if self.params_override is not None:
+            po = self.params_override
+            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
+            assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
+            assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
+            gguf_writer.add_context_length      (po.n_ctx)
+            gguf_writer.add_embedding_length    (po.n_embd)
+            gguf_writer.add_block_count         (po.n_layer)
+            gguf_writer.add_feed_forward_length (po.n_ff)
+            gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
+            gguf_writer.add_head_count          (po.n_head)
+            gguf_writer.add_head_count_kv       (po.n_head_kv)
+            gguf_writer.add_layer_norm_rms_eps  (po.f_norm_eps)
+            return
+        gguf_writer.add_context_length(cfg.context_length)
+        gguf_writer.add_embedding_length(hp.n_embd)
+        gguf_writer.add_block_count(hp.n_layer)
+        gguf_writer.add_feed_forward_length(hp.n_ff)
+        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
+        gguf_writer.add_head_count(hp.n_head)
+        gguf_writer.add_head_count_kv(self.n_kv_head)
+        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
+
+    def add_vocab(self, gguf_writer):
+        hp = self.model.hyperparameters
+        gguf_writer.add_tokenizer_model('llama')
+        tokens = []
+        scores = []
+        toktypes = []
+        if self.vocab_override is not None:
+            vo = self.vocab_override
+            print('* Adding vocab item(s)')
+            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+                tokens.append(vbytes)
+                scores.append(score)
+                toktypes.append(ttype)
+            assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
+            gguf_writer.add_token_list(tokens)
+            gguf_writer.add_token_scores(scores)
+            if len(toktypes) > 0:
+                gguf_writer.add_token_types(toktypes)
+            return
+        print(f'* Adding {hp.n_vocab} vocab item(s)')
+        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
+        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
+            tt = 1 # Normal
+            # Special handling for UNK, BOS, EOS tokens.
+            if tokid <= 2:
+                if tokid == 0:
+                    vbytes = b'<unk>'
+                    tt = 2
+                elif tokid == 1:
+                    vbytes = b'<s>'
+                    tt = 3
+                else:
+                    vbytes = b'</s>'
+                    tt = 3
+            elif len(vbytes) == 0:
+                tt = 3 # Control
+            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
+                vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
+                tt = 6 # Byte
+            else:
+                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
+            toktypes.append(tt)
+            tokens.append(vbytes)
+            scores.append(vscore)
+        gguf_writer.add_token_list(tokens)
+        gguf_writer.add_token_scores(scores)
+        gguf_writer.add_token_types(toktypes)
+        gguf_writer.add_unk_token_id(0)
+        gguf_writer.add_bos_token_id(1)
+        gguf_writer.add_eos_token_id(2)
+
+    def add_tensors(self, gguf_writer):
+        nm = self.name_map
+        data = self.data
+        print(f'* Adding {len(self.model.tensors)} tensor(s)')
+        for tensor in self.model.tensors:
+            name = str(tensor.name, 'UTF-8')
+            if name.endswith('.weight'):
+                name = name[:-7]
+                suffix = '.weight'
+            elif name.endswith('.bias'):
+                name = name[:-5]
+                suffix = '.bias'
+            mapped_name = nm.get(name)
+            assert mapped_name is not None, f'Bad name {name}'
+            mapped_name += suffix
+            tempdims = list(tensor.dims[:])
+            if len(tempdims) > 1:
+                temp = tempdims[1]
+                tempdims[1] = tempdims[0]
+                tempdims[0] = temp
+            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
+            gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
+
+def handle_metadata(cfg, hp):
+    import convert
+    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
+    hf_config_path   = cfg.model_metadata_dir / "config.json"
+    orig_config_path = cfg.model_metadata_dir / "params.json"
+    # We pass a fake model here. "original" mode will check the shapes of some
+    # tensors if information is missing in the .json file: other than that, the
+    # model data isn't used so this should be safe (at least for now).
+    fakemodel = {
+        'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+        'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+    }
+    fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
+    fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
+    if hf_config_path.exists():
+        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
+    elif orig_config_path.exists():
+        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
+    else:
+        raise ValueError('Unable to load metadata')
+    vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
+    convert.check_vocab_size(params, vocab)
+    return (params, vocab)
+
+def handle_args():
+    parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
+    parser.add_argument('--input', '-i', type = Path, help = 'Input GGMLv3 filename')
+    parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename')
+    parser.add_argument('--name', help = 'Set model name')
+    parser.add_argument('--desc', help = 'Set model description')
+    parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
+    parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
+    parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
+    parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
+    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
+    parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
+    return parser.parse_args()
+
+def main():
+    cfg = handle_args()
+    print(f'* Using config: {cfg}')
+    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
+    data = np.memmap(cfg.input, mode = 'r')
+    model = GGMLV3Model()
+    print('* Scanning GGML input file')
+    offset = model.load(data, 0)
+    print(f'* GGML model hyperparameters: {model.hyperparameters}')
+    vocab_override = None
+    params_override = None
+    if cfg.model_metadata_dir is not None:
+        (params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
+        print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
+        print(f'* Overriding params: {params_override}')
+        print(f'* Overriding vocab: {vocab_override}')
+    else:
+        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
+    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
+    converter.save()
+    print(f'* Successful completion. Output saved to: {cfg.output}')
+
+if __name__ == '__main__':
+    main()
--- a/convert-llama-hf-to-gguf.py
+++ b/convert-llama-hf-to-gguf.py
@@ -0,0 +1,328 @@
+#!/usr/bin/env python3
+# HF llama --> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List, Optional
+from pathlib import Path
+from sentencepiece import SentencePieceProcessor
+
+#NDArray = np.ndarray[Any, Any]
+# compatible with python < 3.9
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+# reverse HF permute back to original pth layout
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
+
+
+def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head //= n_kv_head
+
+    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape))
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "LlamaForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.LLAMA
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+    head_count_kv = hparams["num_key_value_heads"]
+else:
+    head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+    hf_repo = hparams["_name_or_path"]
+else:
+    hf_repo = ""
+
+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+else:
+    print("gguf: can not find ctx length parameter.")
+
+    sys.exit()
+
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[bytes] = []
+scores: List[float] = []
+toktypes: List[int] = []
+
+if Path(dir_model + "/tokenizer.model").is_file():
+    # vocab type sentencepiece
+    print("gguf: get sentencepiece tokenizer vocab, scores and token types")
+
+    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+
+    for i in range(tokenizer.vocab_size()):
+        text: bytes
+        score: float
+
+        piece = tokenizer.id_to_piece(i)
+        text = piece.encode("utf-8")
+        score = tokenizer.get_score(i)
+
+        toktype = 1  # defualt to normal token type
+        if tokenizer.is_unknown(i):
+            toktype = 2
+        if tokenizer.is_control(i):
+            toktype = 3
+
+        # toktype = 4 is user-defined = tokens from added_tokens.json
+
+        if tokenizer.is_unused(i):
+            toktype = 5
+        if tokenizer.is_byte(i):
+            toktype = 6
+
+        tokens.append(text)
+        scores.append(score)
+        toktypes.append(toktype)
+
+    if Path(dir_model + "/added_tokens.json").is_file():
+        with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+            addtokens_json = json.load(f)
+
+            print("gguf: get added tokens")
+
+            for key in addtokens_json:
+                tokens.append( key.encode("utf-8") )
+                scores.append(-1000.0)
+                toktypes.append(4) # user-defined token type
+
+
+    gguf_writer.add_tokenizer_model("llama")
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_scores(scores)
+    gguf_writer.add_token_types(toktypes)
+
+
+print("gguf: get special token ids")
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # Look for special tokens in tokenizer.json if it exists
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer = json.load(f)
+
+    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]["content"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]["content"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]["content"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]["content"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]["content"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+else:
+    # If no tokenizer.json: Look for special tokens in config.json
+
+    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
+        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
+
+    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
+        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
+
+    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
+        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
+
+    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
+        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
+
+    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
+        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = ("pytorch_model.bin",)
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name.endswith(".rotary_emb.inv_freq"):
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # reverse permute these
+        if name.endswith(".q_proj.weight"):
+            data = reverse_hf_permute(data, head_count)
+        if name.endswith(".k_proj.weight"):
+            data = reverse_hf_permute(data, head_count, head_count_kv)
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import json
 import os
 import re
@@ -6,23 +6,22 @@ import struct
 import sys
 from typing import Any, Dict, Sequence, TextIO

+import numpy as np
 import torch

-from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType
+NUMPY_TYPE_TO_FTYPE: Dict[str, int] = {"float32": 0, "float16": 1}
+

 HF_SUBLAYER_TO_GGML = {
-    "self_attn.q_proj": "attention.wq",
-    "self_attn.k_proj": "attention.wk",
-    "self_attn.v_proj": "attention.wv",
-    "self_attn.o_proj": "attention.wo",
-    "mlp.gate_proj": "feed_forward.w1",
-    "mlp.down_proj": "feed_forward.w2",
-    "mlp.up_proj": "feed_forward.w3",
-    "input_layernorm": "attention_norm",
+    "self_attn.q_proj": "attn_q",
+    "self_attn.k_proj": "attn_k",
+    "self_attn.v_proj": "attn_v",
+    "self_attn.o_proj": "attn_output",
+    "mlp.gate_proj": "ffn_gate",
+    "mlp.down_proj": "ffn_down",
+    "mlp.up_proj": "ffn_up",
+    "input_layernorm": "attn_norm",
    "post_attention_layernorm": "ffn_norm",
-    # "norm": "norm",
-    # "embed_tokens": "tok_embeddings",
-    # "lm_head": "output",
 }


@@ -39,7 +38,7 @@ def translate_tensor_name(t: str) -> str:
            sys.exit(1)

        output_string = (
-            f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
+            f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
        )
        return output_string
    else:
@@ -54,12 +53,14 @@ def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
    # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
    # but some models ship a float value instead
    # let's convert to int, but fail if lossless conversion is not possible
-    assert int(params["lora_alpha"]) == params["lora_alpha"], "cannot convert float to int losslessly"
+    assert (
+        int(params["lora_alpha"]) == params["lora_alpha"]
+    ), "cannot convert float to int losslessly"
    fout.write(struct.pack("i", int(params["lora_alpha"])))


 def write_tensor_header(
-    self, name: str, shape: Sequence[int], data_type: DataType
+    self, name: str, shape: Sequence[int], data_type: np.dtype
 ) -> None:
    sname = name.encode("utf-8")
    fout.write(
@@ -67,7 +68,7 @@ def write_tensor_header(
            "iii",
            len(shape),
            len(sname),
-            DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
+            NUMPY_TYPE_TO_FTYPE[data_type.name],
        )
    )
    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -1,13 +0,0 @@
-# Compatibility stub
-
-import argparse
-
-import convert
-
-parser = argparse.ArgumentParser(
-    description="""[DEPRECATED - use `convert.py` instead]
-    Convert a LLaMA model checkpoint to a ggml compatible file""")
-parser.add_argument('dir_model',  help='directory containing the model checkpoint')
-parser.add_argument('ftype',      help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
-args = parser.parse_args()
-convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])
--- a/convert.py
+++ b/convert.py
--- a/docs/token_generation_performance_tips.md
+++ b/docs/token_generation_performance_tips.md
@@ -3,7 +3,7 @@
 ## Verifying that the model is running on the GPU with cuBLAS
 Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
-./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
+./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```

 When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -25,9 +25,9 @@ GPU: A6000 (48GB VRAM)
 CPU: 7 physical cores
 RAM: 32GB

-Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
+Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)

-Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`

 Result:

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -6,25 +6,6 @@ find_package(Threads REQUIRED)

 # ...

-# common
-
-set(TARGET common)
-
-add_library(${TARGET} OBJECT
-    common.h
-    common.cpp
-    grammar-parser.h
-    grammar-parser.cpp
-    )
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
-target_include_directories(${TARGET} PUBLIC .)
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE llama)
-
 # examples

 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
@@ -40,8 +21,11 @@ else()
    add_subdirectory(benchmark)
    add_subdirectory(baby-llama)
    add_subdirectory(train-text-from-scratch)
+    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(simple)
    add_subdirectory(embd-input)
+    add_subdirectory(llama-bench)
+    add_subdirectory(beam_search)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
--- a/examples/beam_search/CMakeLists.txt
+++ b/examples/beam_search/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(TARGET beam_search)
+add_executable(${TARGET} beam_search.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/beam_search/beam_search.cpp
+++ b/examples/beam_search/beam_search.cpp
@@ -0,0 +1,188 @@
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "common.h"
+#include "llama.h"
+#include "build-info.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
+#include <signal.h>
+#endif
+
+// Used for debugging to print out beam tokens.
+struct ostream_beam_view {
+    llama_context * ctx;
+    llama_beam_view beam_view;
+};
+std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
+    os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
+    for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
+        os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
+    }
+    return os << ')';
+}
+
+// Put here anything you want back in beam_search_callback().
+struct beam_search_callback_data {
+    llama_context * ctx;
+    std::vector<llama_token> response;
+};
+
+// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
+// For example, eob can be flagged due to maximum token length, stop words, etc.
+bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
+    return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
+}
+
+// Function matching type llama_beam_search_callback_fn_t.
+// Custom callback example is called each time the beams lengths increase:
+//  * Show progress by printing ',' following by number of convergent beam tokens if any.
+//  * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
+//    This is also called when the stop condition is met.
+//    Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
+void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
+    auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
+    // Mark beams as EOS as needed.
+    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
+        llama_beam_view& beam_view = beams_state.beam_views[i];
+        if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
+            beam_view.eob = true;
+        }
+    }
+    printf(",");  // Show progress
+    if (const size_t n = beams_state.common_prefix_length) {
+        callback_data.response.resize(callback_data.response.size() + n);
+        assert(0u < beams_state.n_beams);
+        const llama_token * tokens = beams_state.beam_views[0].tokens;
+        std::copy(tokens, tokens + n, callback_data.response.end() - n);
+        printf("%lu", n);
+    }
+    fflush(stdout);
+#if 1 // DEBUG: print current beams for this iteration
+    std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
+    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
+        std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
+    }
+#endif
+}
+
+int main(int argc, char ** argv)
+{
+    gpt_params params;
+    //params.n_gpu_layers = 200;
+
+    //---------------------------------
+    // Print help :
+    //---------------------------------
+
+    if ( argc < 2 || argv[1][0] == '-' )
+    {
+        printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
+        return 1 ;
+    }
+
+    //---------------------------------
+    // Load parameters :
+    //---------------------------------
+
+    params.model = argv[1];
+
+    params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
+
+    if ( argc > 3 )
+    {
+        params.prompt = argv[3];
+    }
+
+    if ( params.prompt.empty() )
+    {
+        params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
+    }
+
+    //---------------------------------
+    // Init LLM :
+    //---------------------------------
+
+    llama_backend_init(params.numa);
+
+    llama_model * model;
+    llama_context * ctx;
+
+    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+
+    if ( model == NULL )
+    {
+        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+        return 1;
+    }
+
+    //---------------------------------
+    // Tokenize the prompt :
+    //---------------------------------
+
+    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
+
+    const size_t max_context_size     = llama_n_ctx( ctx );
+    const size_t max_tokens_list_size = max_context_size - 4 ;
+
+    if (tokens_list.size() > max_tokens_list_size)
+    {
+        fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" ,
+             __func__ , tokens_list.size() , max_tokens_list_size );
+        return 1;
+    }
+
+    fprintf( stderr, "\n\n" );
+
+    // Print the tokens from the prompt :
+
+    for( auto id : tokens_list )
+    {
+        std::cout << llama_token_to_piece(ctx, id);
+    }
+    std::cout << std::flush;
+
+    int n_past = llama_get_kv_cache_token_count(ctx);
+    if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
+    {
+        fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
+        return 1;
+    }
+    n_past += tokens_list.size();
+
+    beam_search_callback_data callback_data{ctx, {}};
+    size_t const beam_width = static_cast<size_t>(params.n_beams);
+    int const n_predict = 256;
+    llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
+
+    std::cout << "\n\n";
+    for (llama_token const token_id : callback_data.response) {
+        std::cout << llama_token_to_piece(ctx,token_id);
+    }
+    std::cout << std::endl;
+
+    llama_free( ctx );
+    llama_free_model( model );
+
+    llama_backend_free();
+
+    return 0;
+}
--- a/examples/convert-llama2c-to-ggml/CMakeLists.txt
+++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET convert-llama2c-to-ggml)
+add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -0,0 +1,26 @@
+## Convert llama2.c model to ggml
+
+This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
+
+To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository:
+
+`$ make -j`
+
+After successful compilation, following usage options are available:
+```
+usage: ./convert-llama2c-to-ggml [options]
+
+options:
+  -h, --help                       show this help message and exit
+  --copy-vocab-from-model FNAME    path of gguf llama model or llama2.c vocabulary from which to copy vocab (default 'models/7B/ggml-model-f16.gguf')
+  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model
+  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default ak_llama_model.bin')
+```
+
+An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
+
+`$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin`
+
+Now you can use the model with a command like:
+
+`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -0,0 +1,959 @@
+#include "ggml.h"
+#include "llama.h"
+
+#include <unordered_map>
+#include <vector>
+#include <cassert>
+#include <climits>
+#include <cstring>
+#include <cstdarg>
+#include <ctime>
+#include <random>
+#include <stdexcept>
+#include <sstream>
+#include <algorithm>
+#include <string>
+
+// GGUF keys & tensor names.
+
+#define KV_GENERAL_ARCHITECTURE          "general.architecture"
+#define KV_GENERAL_NAME                  "general.name"
+
+#define KV_TOKENIZER_MODEL               "tokenizer.ggml.model"
+#define KV_TOKENIZER_LIST                "tokenizer.ggml.tokens"
+#define KV_TOKENIZER_TOKEN_TYPE          "tokenizer.ggml.token_type"
+#define KV_TOKENIZER_SCORES              "tokenizer.ggml.scores"
+#define KV_TOKENIZER_BOS_ID              "tokenizer.ggml.bos_token_id"
+#define KV_TOKENIZER_EOS_ID              "tokenizer.ggml.eos_token_id"
+#define KV_TOKENIZER_UNK_ID              "tokenizer.ggml.unknown_token_id"
+#define KV_TOKENIZER_SEP_ID              "tokenizer.ggml.seperator_token_id"
+#define KV_TOKENIZER_PAD_ID              "tokenizer.ggml.padding_token_id"
+#define KV_TOKENIZER_HF_JSON             "tokenizer.huggingface.json"
+
+#define KV_CONTEXT_LENGTH                "llama.context_length"
+#define KV_EMBEDDING_LENGTH              "llama.embedding_length"
+#define KV_BLOCK_COUNT                   "llama.block_count"
+#define KV_FEED_FORWARD_LENGTH           "llama.feed_forward_length"
+#define KV_ATTENTION_HEAD_COUNT          "llama.attention.head_count"
+#define KV_ATTENTION_HEAD_COUNT_KV       "llama.attention.head_count_kv"
+#define KV_ATTENTION_LAYERNORM_RMS_EPS   "llama.attention.layer_norm_rms_epsilon"
+#define KV_ROPE_DIMENSION_COUNT          "llama.rope.dimension_count"
+
+#define TN_TOKEN_EMBD  "token_embd.weight"
+#define TN_OUTPUT_NORM "output_norm.weight"
+#define TN_OUTPUT      "output.weight"
+#define TN_ATTN_NORM   "blk.%d.attn_norm.weight"
+#define TN_ATTN_Q      "blk.%d.attn_q.weight"
+#define TN_ATTN_K      "blk.%d.attn_k.weight"
+#define TN_ATTN_V      "blk.%d.attn_v.weight"
+#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
+#define TN_FFN_NORM    "blk.%d.ffn_norm.weight"
+#define TN_FFN_GATE    "blk.%d.ffn_gate.weight"
+#define TN_FFN_DOWN    "blk.%d.ffn_down.weight"
+#define TN_FFN_UP      "blk.%d.ffn_up.weight"
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_FILE_VERSION_GGJT_V3   3
+
+#define TOKENIZER_NAME "llama"
+#define UNKNOWN_TOKEN_ID 0
+#define BOS_TOKEN_ID 1
+#define EOS_TOKEN_ID 2
+
+//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
+typedef struct {
+    int dim; // transformer dimension
+    int hidden_dim; // for ffn layers
+    int n_layers; // number of layers
+    int n_heads; // number of query heads
+    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
+    int vocab_size; // vocabulary size, usually 256 (byte-level)
+    int seq_len; // max sequence length
+} Config;
+
+typedef struct {
+    // token embedding table
+    float* token_embedding_table;    // (vocab_size, dim)
+    // weights for rmsnorms
+    float* rms_att_weight; // (layer, dim) rmsnorm weights
+    float* rms_ffn_weight; // (layer, dim)
+    // weights for matmuls
+    float* wq; // (layer, dim, dim)
+    float* wk; // (layer, dim, dim)
+    float* wv; // (layer, dim, dim)
+    float* wo; // (layer, dim, dim)
+    // weights for ffn
+    float* w1; // (layer, hidden_dim, dim)
+    float* w2; // (layer, dim, hidden_dim)
+    float* w3; // (layer, hidden_dim, dim)
+    // final rmsnorm
+    float* rms_final_weight; // (dim,)
+    // freq_cis for RoPE relatively positional embeddings
+    // float* freq_cis_real; // (seq_len, dim/2)
+    // float* freq_cis_imag; // (seq_len, dim/2)
+    // (optional) classifier weights for the logits, on the last layer
+    float* wcls;
+} TransformerWeights;
+
+void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
+    // we calloc instead of malloc to keep valgrind happy
+    w->token_embedding_table = new float[p->vocab_size * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+
+    w->rms_att_weight = new float[p->n_layers * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+
+    w->rms_ffn_weight = new float[p->n_layers * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+
+    w->wq = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->wk = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->wv = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->wo = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+
+    w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+
+    w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+
+    w->rms_final_weight = new float[p->dim]();
+    printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+
+    if (shared_weights) {
+        w->wcls = NULL;
+    } else {
+        w->wcls = new float[p->vocab_size * p->dim]();
+        printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+    }
+}
+
+int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
+    if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+    if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
+    if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
+    if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
+    if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
+    if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
+    if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
+
+    // Skip freq_cis_real & freq_cis_imag
+    int head_size = p->dim / p->n_heads;
+    fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
+
+    if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+
+    // Check we didn't forget to read anything
+    auto curr = ftell(f);
+    fseek(f, 0, SEEK_END);
+    auto end = ftell(f);
+    if (curr != end) {
+        printf("Error: failed to read the checkpoint file to the end (curr = %ld, end =  %ld)\n", curr, end);
+        return 1;
+    }
+
+    return 0;
+}
+
+void free_weights(TransformerWeights* w) {
+    delete w->token_embedding_table;
+    delete w->rms_att_weight;
+    delete w->rms_ffn_weight;
+    delete w->wq;
+    delete w->wk;
+    delete w->wv;
+    delete w->wo;
+    delete w->w1;
+    delete w->w2;
+    delete w->w3;
+    delete w->rms_final_weight;
+    if (w->wcls) delete w->wcls;
+}
+
+void print_sample_weights(TransformerWeights *w){
+    printf("----- Quick print of first of the weight vales of all the variables\n");
+    printf("%f\n", w->token_embedding_table[0]);
+    printf("%f\n", w->rms_att_weight[0]);
+    printf("%f\n", w->rms_ffn_weight[0]);
+
+    printf("%f\n", w->wq[0]);
+    printf("%f\n", w->wk[0]);
+    printf("%f\n", w->wv[0]);
+    printf("%f\n", w->wo[0]);
+    printf("%f\n", w->w1[0]);
+    printf("%f\n", w->w2[0]);
+    printf("%f\n", w->w3[0]);
+    printf("%f\n", w->rms_att_weight[0]);
+    if (w->wcls) printf("%f\n", w->wcls[0]);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
+
+struct llama_vocab {
+    using id    = int32_t;
+    using token = std::string;
+    using ttype = llama_token_type;
+
+    struct token_data {
+        token text;
+        float score;
+        ttype type;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_data> id_to_token;
+};
+
+struct my_llama_hparams {
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx   = 512;   // this is provided as user input?
+    uint32_t n_embd  = 4096;
+    uint32_t n_ff    = 11008;
+    uint32_t n_mult  = 4;
+    uint32_t n_head  = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot   = 64;
+    bool operator!=(const my_llama_hparams& other) const {
+        return memcmp(this, &other, sizeof(my_llama_hparams));
+    }
+};
+
+struct my_llama_layer {
+    // normalization
+    struct ggml_tensor * attention_norm;
+
+    // attention
+    struct ggml_tensor * wq;
+    struct ggml_tensor * wk;
+    struct ggml_tensor * wv;
+    struct ggml_tensor * wo;
+
+    // normalization
+    struct ggml_tensor * ffn_norm;
+
+    // ff
+    struct ggml_tensor * w1;
+    struct ggml_tensor * w2;
+    struct ggml_tensor * w3;
+};
+
+struct my_llama_model {
+    struct ggml_context * ctx = NULL;
+
+    std::string name;
+
+    my_llama_hparams hparams;
+
+    struct ggml_tensor * tok_embeddings;
+
+    struct ggml_tensor * norm;
+    struct ggml_tensor * output;
+
+    std::vector<my_llama_layer> layers;
+
+    uint32_t train_its = 0;
+    uint32_t train_samples = 0;
+    uint32_t train_tokens = 0;
+};
+
+struct train_params {
+    const char * fn_vocab_model;
+    const char * fn_llama2c_model;
+    const char * fn_llama2c_output_model;
+    const char * fn_train_data;
+    const char * fn_checkpoint_in;
+    const char * fn_checkpoint_out;
+    const char * fn_model_out;
+
+    uint32_t seed;
+
+    int n_ctx;
+    int n_embd;
+    int n_mult;
+    int n_head;
+    int n_layer;
+    int n_rotmax;
+
+    int n_threads;
+    int n_batch;
+    int n_examples;
+    int n_predict;
+
+    int print_info_interval;
+    int print_details_interval;
+
+    bool samples_start_after_nl;
+    bool use_adam;
+    bool use_flash;
+    bool use_scratch;
+
+    // only adam
+    int   warmup;
+    int   cos_decay_steps;
+    float cos_decay_restart;
+    float cos_decay_alpha;
+
+    int   lbfgs_n_iter;
+    int   adam_n_iter;
+    float adam_alpha;
+    float adam_decay;
+
+    int mem_model_gb;
+    int mem_compute_gb;
+    int mem_compute0_gb;
+    int mem_compute1_gb;
+};
+
+void print_params(struct my_llama_hparams * params) {
+    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
+    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
+    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
+    printf("%s: n_mult:  %d\n", __func__, params->n_mult);
+    printf("%s: n_head:  %d\n", __func__, params->n_head);
+    printf("%s: n_ff:    %d\n", __func__, params->n_ff);
+    printf("%s: n_layer: %d\n", __func__, params->n_layer);
+    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
+}
+
+void init_model(struct my_llama_model * model) {
+    const auto & hparams = model->hparams;
+
+    const uint32_t n_embd  = hparams.n_embd;
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_vocab = hparams.n_vocab;
+
+    const uint32_t n_ff = hparams.n_ff;
+    struct ggml_context * ctx = model->ctx;
+
+    model->train_its = 0;
+    model->train_samples = 0;
+    model->train_tokens = 0;
+
+    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
+
+    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+    printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
+
+    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+
+    // printing the per-layer allocations here so we dont print in the for loop.
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+
+    printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
+
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+
+    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
+    ggml_set_name(model->norm,           "norm.weight");
+    ggml_set_name(model->output,         "output.weight");
+
+    model->layers.resize(n_layer);
+    for (uint32_t i = 0; i < n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        std::string layers_i = "layers." + std::to_string(i);
+
+        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+
+        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
+        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+
+        ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());
+
+        ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
+        ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
+        ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
+        ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());
+
+        ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
+
+        ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
+        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
+        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
+    }
+}
+
+float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+    float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
+    return *ptr;
+}
+
+int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+    int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
+    return *ptr;
+}
+
+void print_row(struct ggml_tensor * probs, int i) {
+    for (int k = 0; k < probs->ne[0]; ++k) {
+        float p = get_f32_2d(probs, k, i);
+        printf(" %f", p);
+    }
+    printf("\n");
+}
+
+void print_matrix(struct ggml_tensor * probs) {
+    assert(probs->n_dims == 2);
+    for (int i = 0; i < probs->ne[1]; ++i) {
+        for (int k = 0; k < probs->ne[0]; ++k) {
+            float p = get_f32_2d(probs, k, i);
+            printf(" %.2f", p);
+        }
+        printf("\n");
+    }
+}
+
+#ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
+__attribute__((format(printf, 1, 2)))
+#endif
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            size = 0;
+        } else {
+            seek(0, SEEK_END);
+            size = tell();
+            seek(0, SEEK_SET);
+        }
+    }
+
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        GGML_ASSERT(ret == 0); // same
+    }
+
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
+        }
+        if (ret != 1) {
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
+        }
+    }
+
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+    std::float_t read_f32() {
+        std::float_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+bool is_ggml_file(const char *filename) {
+    llama_file file(filename, "rb");
+    if (file.size < 4) {
+        return false;
+    }
+    uint32_t magic = file.read_u32();
+    return magic == GGUF_MAGIC;
+}
+
+static std::string llama_escape_whitespaces(const std::string& text) {
+    std::ostringstream out;
+    for (char c : text) {
+        if (c == ' ') out << "\xe2\x96\x81";
+        else out << c;
+    }
+    return out.str();
+}
+
+void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
+    if (is_ggml_file(filename)) {
+        struct ggml_context * ctx_data = NULL;
+
+        struct gguf_init_params params = {
+            /*.no_alloc = */ false,
+            /*.ctx      = */ &ctx_data,
+        };
+
+        struct gguf_context * ctx = gguf_init_from_file(filename, params);
+        GGML_ASSERT(ctx != NULL);
+
+        const int model_idx = gguf_find_key(ctx, KV_TOKENIZER_MODEL);
+        GGML_ASSERT(model_idx >= 0);
+        std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
+        GGML_ASSERT(tokenizer_name == TOKENIZER_NAME);
+
+        const int token_idx = gguf_find_key(ctx, KV_TOKENIZER_LIST);
+        GGML_ASSERT(token_idx >= 0);
+
+        const int score_idx = gguf_find_key(ctx, KV_TOKENIZER_SCORES);
+        GGML_ASSERT(score_idx >= 0);
+        const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+
+        const int toktype_idx = gguf_find_key(ctx, KV_TOKENIZER_TOKEN_TYPE);
+        GGML_ASSERT(toktype_idx >= 0);
+        const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+
+        const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
+
+        vocab->id_to_token.resize(n_vocab);
+
+        for (uint32_t i = 0; i < n_vocab; i++) {
+            std::string word = gguf_get_arr_str(ctx, token_idx, i);
+
+            vocab->token_to_id[word] = i;
+
+            auto & token_data = vocab->id_to_token[i];
+            token_data.text  = std::move(word);
+            token_data.score = scores[i];
+            token_data.type  = (llama_token_type) toktypes[i];
+        }
+        ggml_free(ctx_data);
+        gguf_free(ctx);
+    } else {
+        // assume llama2.c vocabulary
+        printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
+        llama_file file(filename, "rb");
+        const int  n_vocab = config->vocab_size;
+        /* uint32_t max_token_length =  */ file.read_u32(); // unused
+        vocab->id_to_token.resize(n_vocab);
+        for (llama_vocab::id id=0; id<n_vocab; ++id) {
+            float_t score = file.read_f32();
+            uint32_t len = file.read_u32();
+            std::string text = file.read_string(len);
+
+            unsigned char byte_val;
+            llama_vocab::ttype type = LLAMA_TOKEN_TYPE_NORMAL;
+            if (id == UNKNOWN_TOKEN_ID) {
+                text = "<unk>";
+                type = LLAMA_TOKEN_TYPE_UNKNOWN;
+            } else if (id == BOS_TOKEN_ID) {
+                text = "<s>";
+                type = LLAMA_TOKEN_TYPE_CONTROL;
+            } else if (id == EOS_TOKEN_ID) {
+                text = "</s>";
+                type = LLAMA_TOKEN_TYPE_CONTROL;
+            } else if (text.empty()) {
+                type = LLAMA_TOKEN_TYPE_CONTROL;
+            } else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
+                // Text of byte tokens is already in the expected format.
+                type = LLAMA_TOKEN_TYPE_BYTE;
+            } else {
+                type = LLAMA_TOKEN_TYPE_NORMAL;
+            }
+            text = llama_escape_whitespaces(text);
+
+            vocab->id_to_token[id].text = text;
+            vocab->id_to_token[id].score = score;
+            vocab->id_to_token[id].type = type;
+            vocab->token_to_id.emplace(text, id);
+        }
+    }
+}
+
+void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
+    int ct;
+    switch (gg_weights->n_dims){
+        case 1:
+            ct = 0;
+            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
+                float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
+                *ptr = karpathy_weights[ct];
+                ct++;
+            }
+            break;
+        case 2:
+            ct = 0;
+            for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                    float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
+                    *ptr = karpathy_weights[ct];
+                    ct++;
+                }
+            }
+            break;
+        case 3:
+            ct = 0;
+            for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
+                for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                    for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                        float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
+                        *ptr = karpathy_weights[ct];
+                        ct++;
+                    }
+                }
+            }
+            break;
+    }
+}
+
+void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
+    // stuff AK weights into GG weights one by one.
+    // w->token_embedding_table -> model->tok_embeddings
+    // float*                   -> struct ggml_tensor
+    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
+
+    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+    //print_row(model->norm, 0);
+
+    // for rms-att-weight
+    int row_length = model->hparams.n_embd;
+    const auto & hparams = model->hparams;
+    int n_ff = model->hparams.n_ff;
+
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+        auto & layer = model->layers[i];
+        // 1d
+        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+
+        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+
+        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+    }
+
+    struct gguf_context * ctx = gguf_init_empty();
+
+    std::vector<const char*> tokens;
+    std::vector<float> scores;
+    std::vector<llama_token_type> token_types;
+    for (const llama_vocab::token_data & token_data : vocab->id_to_token) {
+        tokens.push_back(token_data.text.c_str());
+        scores.push_back(token_data.score);
+        token_types.push_back(token_data.type);
+    }
+    gguf_set_arr_str(ctx, KV_TOKENIZER_LIST, tokens.data(), tokens.size());
+    gguf_set_arr_data(ctx, KV_TOKENIZER_SCORES, GGUF_TYPE_FLOAT32, scores.data(), scores.size());
+    gguf_set_arr_data(ctx, KV_TOKENIZER_TOKEN_TYPE, GGUF_TYPE_INT32, token_types.data(), token_types.size());
+
+    gguf_set_val_str(ctx, KV_TOKENIZER_MODEL, TOKENIZER_NAME);
+
+    gguf_set_val_str(ctx, KV_GENERAL_ARCHITECTURE, "llama");
+    gguf_set_val_str(ctx, KV_GENERAL_NAME, "llama");
+
+    // special tokens
+    gguf_set_val_u32(ctx, KV_TOKENIZER_UNK_ID, UNKNOWN_TOKEN_ID);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_BOS_ID, BOS_TOKEN_ID);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_EOS_ID, EOS_TOKEN_ID);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
+    gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
+
+    gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
+    gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
+    gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
+    gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
+    // n_head_kv is optional, default to n_head
+    // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...);
+    gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer);
+    gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot);
+    gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f);
+
+    // write tensors
+    ggml_set_name(model->tok_embeddings, TN_TOKEN_EMBD);
+    gguf_add_tensor(ctx, model->tok_embeddings);
+
+    ggml_set_name(model->norm, TN_OUTPUT_NORM);
+    gguf_add_tensor(ctx, model->norm);
+
+    ggml_set_name(model->output, TN_OUTPUT);
+    gguf_add_tensor(ctx, model->output);
+
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        ggml_format_name(layer.wq, TN_ATTN_Q, i);
+        gguf_add_tensor(ctx, layer.wq);
+
+        ggml_format_name(layer.wk, TN_ATTN_K, i);
+        gguf_add_tensor(ctx, layer.wk);
+
+        ggml_format_name(layer.wv, TN_ATTN_V, i);
+        gguf_add_tensor(ctx, layer.wv);
+
+        ggml_format_name(layer.wo, TN_ATTN_OUTPUT, i);
+        gguf_add_tensor(ctx, layer.wo);
+
+        ggml_format_name(layer.attention_norm, TN_ATTN_NORM, i);
+        gguf_add_tensor(ctx, layer.attention_norm);
+
+        ggml_format_name(layer.w1, TN_FFN_GATE, i);
+        gguf_add_tensor(ctx, layer.w1);
+
+        ggml_format_name(layer.w2, TN_FFN_DOWN, i);
+        gguf_add_tensor(ctx, layer.w2);
+
+        ggml_format_name(layer.w3, TN_FFN_UP, i);
+        gguf_add_tensor(ctx, layer.w3);
+
+        ggml_format_name(layer.ffn_norm, TN_FFN_NORM, i);
+        gguf_add_tensor(ctx, layer.ffn_norm);
+    }
+
+    gguf_write_to_file(ctx, filename, false);
+    gguf_free(ctx);
+}
+
+struct train_params get_default_train_params() {
+    struct train_params params;
+    params.fn_vocab_model    = "models/7B/ggml-model-f16.gguf";
+    params.fn_llama2c_output_model = "ak_llama_model.bin";
+    params.fn_train_data     = "shakespeare.txt";
+    params.fn_checkpoint_in  = "checkpoint.bin";
+    params.fn_checkpoint_out = "checkpoint.bin";
+    params.fn_model_out      = "ggml-checkpoint-f32.bin";
+
+    params.seed       =   -1;
+
+    params.n_ctx      =  128;
+    params.n_embd     =  256;
+    params.n_mult     =  256;
+    params.n_head     =    8;
+    params.n_layer    =   16;
+    params.n_rotmax   =   64;
+
+    params.n_threads  =    6;
+    params.n_batch    =    8;
+    params.n_examples =    8;
+    params.n_predict  = 1024;
+
+    params.print_info_interval    = 1;
+    params.print_details_interval = 2;
+
+    params.samples_start_after_nl = false;
+    params.use_adam               = true;
+    params.use_flash              = true;
+    params.use_scratch            = true;
+
+    // only adam
+    params.warmup            =  100;
+    params.cos_decay_steps   = 1000;
+    params.cos_decay_restart = 1.1f;
+    params.cos_decay_alpha   = 0.0f;
+
+    params.lbfgs_n_iter      = 16;
+    params.adam_n_iter       = 16;
+    params.adam_alpha        = 1e-3f;
+    params.adam_decay        = 1e-3f;
+
+    params.mem_model_gb   = 2;
+    params.mem_compute_gb = 24;
+    params.mem_compute0_gb = 8;
+    params.mem_compute1_gb = 2;
+
+    return params;
+}
+
+void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help                       show this help message and exit\n");
+    fprintf(stderr, "  --copy-vocab-from-model FNAME    path of gguf llama model or llama2.c vocabulary from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
+    fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
+    fprintf(stderr, "\n");
+}
+
+bool params_parse(int argc, char ** argv, struct train_params * params) {
+    bool invalid_param = false;
+    bool reqd_param_found = false;
+    std::string arg;
+    struct train_params default_params = get_default_train_params();
+    const std::string arg_prefix = "--";
+
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        if (arg == "--copy-vocab-from-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_vocab_model = argv[i];
+        } else if (arg == "--llama2c-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            reqd_param_found = true;
+            params->fn_llama2c_model = argv[i];
+        } else if (arg == "--llama2c-output-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_llama2c_output_model = argv[i];
+        } else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, &default_params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            print_usage(argc, argv, &default_params);
+            exit(1);
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+    if (!reqd_param_found){
+        fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
+        print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+
+    return true;
+}
+
+std::string basename(const std::string &path) {
+    size_t pos = path.find_last_of("/");
+    if (pos == std::string::npos) {
+        return path;
+    }
+    return path.substr(pos + 1);
+}
+
+int main(int argc, char ** argv) {
+    struct train_params params = get_default_train_params();
+    if (!params_parse(argc, argv, &params)) {
+        return 1;
+    }
+    Config config;
+    TransformerWeights weights;
+    {
+        FILE *file = fopen(params.fn_llama2c_model, "rb");
+        if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
+        // read in the config header
+        if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
+        auto shared_weights = config.vocab_size > 0;
+        config.vocab_size = abs(config.vocab_size);
+
+        // read in the Transformer weights
+        malloc_weights(&weights, &config, shared_weights);
+        if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
+        fclose(file);
+    }
+
+    struct llama_vocab vocab;
+    load_vocab(params.fn_vocab_model, &config, &vocab);
+
+    struct my_llama_model model;
+    model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
+    model.hparams.n_ctx   = params.n_ctx;
+    model.hparams.n_embd  = config.dim; //params.n_embd;
+    model.hparams.n_ff    = config.hidden_dim;
+    model.hparams.n_mult  = 32;//params.n_mult;
+    model.hparams.n_head  = config.n_heads; //params.n_head;
+    model.hparams.n_layer = config.n_layers; //params.n_layer;
+    model.hparams.n_rot   = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
+    print_params(&model.hparams);
+    struct ggml_init_params lcparams;
+    lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
+    lcparams.mem_buffer = NULL;
+    lcparams.no_alloc   = false;
+
+    model.ctx = ggml_init(lcparams);
+
+    init_model(&model);
+    model.name = basename(params.fn_llama2c_model);
+    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
+
+    printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
+
+    ggml_free(model.ctx);
+    free_weights(&weights);
+    return 0;
+}
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@@ -30,7 +30,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
+        params.seed = uint32_t(time(NULL));
    }
    fprintf(stderr, "%s: seed  = %d\n", __func__, params.seed);

@@ -167,7 +167,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // TODO: Apply penalties
-        // float nl_logit = logits[llama_token_nl()];
+        // float nl_logit = logits[llama_token_nl(ctx)];
        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
        // llama_sample_repetition_penalty(ctx, &candidates_p,
        //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -176,7 +176,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
        // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
        // last_n_repeat, alpha_frequency, alpha_presence);
        // if (!penalize_nl) {
-        //     logits[llama_token_nl()] = nl_logit;
+        //     logits[llama_token_nl(ctx)] = nl_logit;
        // }

        if (temp <= 0) {
@@ -211,10 +211,10 @@ const char * sampling(struct MyModel * mymodel) {
    llama_context * ctx = mymodel->ctx;
    int id = sampling_id(mymodel);
    static std::string ret;
-    if (id == llama_token_eos()) {
+    if (id == llama_token_eos(ctx)) {
        ret = "</s>";
    } else {
-        ret = llama_token_to_str(ctx, id);
+        ret = llama_token_to_piece(ctx, id);
    }
    eval_id(mymodel, id);
    return ret.c_str();
--- a/examples/embd-input/embd_input.py
+++ b/examples/embd-input/embd_input.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import ctypes
 from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
 import numpy as np
--- a/examples/embd-input/llava.py
+++ b/examples/embd-input/llava.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
--- a/examples/embd-input/minigpt4.py
+++ b/examples/embd-input/minigpt4.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
--- a/examples/embd-input/panda_gpt.py
+++ b/examples/embd-input/panda_gpt.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import sys
 import os
 sys.path.insert(0, os.path.dirname(__file__))
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -56,9 +56,6 @@ int main(int argc, char ** argv) {

    int n_past = 0;

-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
    // tokenize the prompt
    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);

@@ -67,28 +64,35 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }
        fprintf(stderr, "\n");
    }

-    if (params.embedding){
-        if (embd_inp.size() > 0) {
-            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
-                return 1;
-            }
-        }
-
-        const int n_embd = llama_n_embd(ctx);
-        const auto embeddings = llama_get_embeddings(ctx);
-
-        for (int i = 0; i < n_embd; i++) {
-            printf("%f ", embeddings[i]);
-        }
-        printf("\n");
+    if (embd_inp.size() > (size_t)params.n_ctx) {
+        fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
+                __func__, embd_inp.size(), params.n_ctx);
+        return 1;
    }

+    while (!embd_inp.empty()) {
+        int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
+        if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return 1;
+        }
+        n_past += n_tokens;
+        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
+    }
+
+    const int n_embd = llama_n_embd(ctx);
+    const auto embeddings = llama_get_embeddings(ctx);
+
+    for (int i = 0; i < n_embd; i++) {
+        printf("%f ", embeddings[i]);
+    }
+    printf("\n");
+
    llama_print_timings(ctx);
    llama_free(ctx);
    llama_free_model(model);
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -0,0 +1,249 @@
+#include "ggml.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <cinttypes>
+#include <string>
+#include <sstream>
+#include <fstream>
+#include <vector>
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+template<typename T>
+static std::string to_string(const T & val) {
+    std::stringstream ss;
+    ss << val;
+    return ss.str();
+}
+
+bool gguf_ex_write(const std::string & fname) {
+    struct gguf_context * ctx = gguf_init_empty();
+
+    gguf_set_val_u8  (ctx, "some.parameter.uint8",    0x12);
+    gguf_set_val_i8  (ctx, "some.parameter.int8",    -0x13);
+    gguf_set_val_u16 (ctx, "some.parameter.uint16",   0x1234);
+    gguf_set_val_i16 (ctx, "some.parameter.int16",   -0x1235);
+    gguf_set_val_u32 (ctx, "some.parameter.uint32",   0x12345678);
+    gguf_set_val_i32 (ctx, "some.parameter.int32",   -0x12345679);
+    gguf_set_val_f32 (ctx, "some.parameter.float32",  0.123456789f);
+    gguf_set_val_u64 (ctx, "some.parameter.uint64",   0x123456789abcdef0ull);
+    gguf_set_val_i64 (ctx, "some.parameter.int64",   -0x123456789abcdef1ll);
+    gguf_set_val_f64 (ctx, "some.parameter.float64",  0.1234567890123456789);
+    gguf_set_val_bool(ctx, "some.parameter.bool",     true);
+    gguf_set_val_str (ctx, "some.parameter.string",   "hello world");
+
+    gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16,   std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
+    gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
+    gguf_set_arr_str (ctx, "some.parameter.arr.str",                    std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ 128ull*1024ull*1024ull,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx_data = ggml_init(params);
+
+    const int n_tensors = 10;
+
+    // tensor infos
+    for (int i = 0; i < n_tensors; ++i) {
+        const std::string name = "tensor_" + to_string(i);
+
+        int64_t ne[GGML_MAX_DIMS] = { 1 };
+        int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
+
+        for (int j = 0; j < n_dims; ++j) {
+            ne[j] = rand() % 10 + 1;
+        }
+
+        struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
+        ggml_set_name(cur, name.c_str());
+
+        {
+            float * data = (float *) cur->data;
+            for (int j = 0; j < ggml_nelements(cur); ++j) {
+                data[j] = 100 + i;
+            }
+        }
+
+        gguf_add_tensor(ctx, cur);
+    }
+
+    gguf_write_to_file(ctx, fname.c_str(), false);
+
+    fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+    return true;
+}
+
+// just read tensor info
+bool gguf_ex_read_0(const std::string & fname) {
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ NULL,
+    };
+
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    fprintf(stdout, "%s: version:      %d\n", __func__, gguf_get_version(ctx));
+    fprintf(stdout, "%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
+    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+
+    // kv
+    {
+        const int n_kv = gguf_get_n_kv(ctx);
+
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ctx, i);
+
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+        }
+    }
+
+    // find kv string
+    {
+        const char * findkey = "some.parameter.string";
+
+        const int keyidx = gguf_find_key(ctx, findkey);
+        if (keyidx == -1) {
+            fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
+        } else {
+            const char * key_value = gguf_get_val_str(ctx, keyidx);
+            fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
+        }
+    }
+
+    // tensor info
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t offset = gguf_get_tensor_offset(ctx, i);
+
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        }
+    }
+
+    gguf_free(ctx);
+
+    return true;
+}
+
+// read and create ggml_context containing the tensors and their data
+bool gguf_ex_read_1(const std::string & fname) {
+    struct ggml_context * ctx_data = NULL;
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx_data,
+    };
+
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    fprintf(stdout, "%s: version:      %d\n", __func__, gguf_get_version(ctx));
+    fprintf(stdout, "%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
+    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+
+    // kv
+    {
+        const int n_kv = gguf_get_n_kv(ctx);
+
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ctx, i);
+
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+        }
+    }
+
+    // tensor info
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t offset = gguf_get_tensor_offset(ctx, i);
+
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        }
+    }
+
+    // data
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
+
+            const char * name = gguf_get_tensor_name(ctx, i);
+
+            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+
+            fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+
+            // print first 10 elements
+            const float * data = (const float *) cur->data;
+
+            printf("%s data[:10] : ", name);
+            for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
+                printf("%f ", data[j]);
+            }
+            printf("\n\n");
+
+            // check data
+            {
+                const float * data = (const float *) cur->data;
+                for (int j = 0; j < ggml_nelements(cur); ++j) {
+                    if (data[j] != 100 + i) {
+                        fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                        return false;
+                    }
+                }
+            }
+        }
+    }
+
+    fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    if (argc < 3) {
+        fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
+        return -1;
+    }
+
+    const std::string fname(argv[1]);
+    const std::string mode (argv[2]);
+
+    GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
+
+    if (mode == "w") {
+        GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
+    } else if (mode == "r") {
+        GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
+        GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
+    }
+
+    return 0;
+}
--- a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
+++ b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
--- a/examples/gptneox-wip/falcon-main.cpp
+++ b/examples/gptneox-wip/falcon-main.cpp
--- a/examples/gptneox-wip/gptneox-main.cpp
+++ b/examples/gptneox-wip/gptneox-main.cpp
--- a/examples/jeopardy/graph.py
+++ b/examples/jeopardy/graph.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import matplotlib.pyplot as plt
 import os
 import csv
--- a/examples/jeopardy/jeopardy.sh
+++ b/examples/jeopardy/jeopardy.sh
--- a/examples/json-schema-to-grammar.py
+++ b/examples/json-schema-to-grammar.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import re
+import sys
+
+# whitespace is constrained to a single space char to prevent model "running away" in
+# whitespace. Also maybe improves generation quality?
+SPACE_RULE = '" "?'
+
+PRIMITIVE_RULES = {
+    'boolean': '("true" | "false") space',
+    'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
+    'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
+    'string': r''' "\"" (
+        [^"\\] |
+        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+      )* "\"" space ''',
+    'null': '"null" space',
+}
+
+INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
+GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
+GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'}
+
+
+class SchemaConverter:
+    def __init__(self, prop_order):
+        self._prop_order = prop_order
+        self._rules = {'space': SPACE_RULE}
+
+    def _format_literal(self, literal):
+        escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
+            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
+        )
+        return f'"{escaped}"'
+
+    def _add_rule(self, name, rule):
+        esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
+        if esc_name not in self._rules or self._rules[esc_name] == rule:
+            key = esc_name
+        else:
+            i = 0
+            while f'{esc_name}{i}' in self._rules:
+                i += 1
+            key = f'{esc_name}{i}'
+        self._rules[key] = rule
+        return key
+
+    def visit(self, schema, name):
+        schema_type = schema.get('type')
+        rule_name = name or 'root'
+
+        if 'oneOf' in schema or 'anyOf' in schema:
+            rule = ' | '.join((
+                self.visit(alt_schema, f'{name}{"-" if name else ""}{i}')
+                for i, alt_schema in enumerate(schema.get('oneOf') or schema['anyOf'])
+            ))
+            return self._add_rule(rule_name, rule)
+
+        elif 'const' in schema:
+            return self._add_rule(rule_name, self._format_literal(schema['const']))
+
+        elif 'enum' in schema:
+            rule = ' | '.join((self._format_literal(v) for v in schema['enum']))
+            return self._add_rule(rule_name, rule)
+
+        elif schema_type == 'object' and 'properties' in schema:
+            # TODO: `required` keyword
+            prop_order = self._prop_order
+            prop_pairs = sorted(
+                schema['properties'].items(),
+                # sort by position in prop_order (if specified) then by key
+                key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
+            )
+
+            rule = '"{" space'
+            for i, (prop_name, prop_schema) in enumerate(prop_pairs):
+                prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
+                if i > 0:
+                    rule += ' "," space'
+                rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}'
+            rule += ' "}" space'
+
+            return self._add_rule(rule_name, rule)
+
+        elif schema_type == 'array' and 'items' in schema:
+            # TODO `prefixItems` keyword
+            item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
+            rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
+            return self._add_rule(rule_name, rule)
+
+        else:
+            assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
+            return self._add_rule(
+                'root' if rule_name == 'root' else schema_type,
+                PRIMITIVE_RULES[schema_type]
+            )
+
+    def format_grammar(self):
+        return '\n'.join((f'{name} ::= {rule}' for name, rule in self._rules.items()))
+
+
+def main(args_in = None):
+    parser = argparse.ArgumentParser(
+        description='''
+            Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
+            given JSON schema. Only a subset of JSON schema features are supported; more may be
+            added in the future.
+        ''',
+    )
+    parser.add_argument(
+        '--prop-order',
+        default=[],
+        type=lambda s: s.split(','),
+        help='''
+            comma-separated property names defining the order of precedence for object properties;
+            properties not specified here are given lower precedence than those that are, and are
+            sorted alphabetically
+        '''
+    )
+    parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
+    args = parser.parse_args(args_in)
+
+    schema = json.load(sys.stdin if args.schema == '-' else open(args.schema))
+    prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
+    converter = SchemaConverter(prop_order)
+    converter.visit(schema, '')
+    print(converter.format_grammar())
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/llama-bench/CMakeLists.txt
+++ b/examples/llama-bench/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(TARGET llama-bench)
+add_executable(${TARGET} llama-bench.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -0,0 +1,132 @@
+" Requires an already running llama.cpp server
+" To install either copy or symlink to ~/.vim/autoload/llama.vim
+" Then start with either :call llama#doLlamaGen(),
+" or add a keybind to your vimrc such as
+" nnoremap Z :call llama#doLlamaGen()<CR>
+" Similarly, you could add an insert mode keybind with
+" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
+"
+" g:llama_api_url and g:llama_overrides can be configured in your .vimrc
+" let g:llama_api_url = "192.168.1.10:8080"
+" llama_overrides can also be set through buffer/window scopes. For instance
+" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
+" Could be added to your .vimrc to automatically set a lower temperature when
+" editing a python script
+" Additionally, an override dict can be stored at the top of a file
+" !*{"stop": ["User:"]}
+" Could be added to the start of your chatlog.txt to set the stopping token
+" These parameter dicts are merged together from lowest to highest priority:
+" server default -> g:llama_overrides -> w:llama_overrides ->
+" b:llama_overrides -> in file (!*) overrides
+"
+" Sublists (like logit_bias and stop) are overridden, not merged
+" Example override:
+" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
+if !exists("g:llama_api_url")
+    let g:llama_api_url= "127.0.0.1:8080"
+endif
+if !exists("g:llama_overrides")
+   let g:llama_overrides = {}
+endif
+const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
+const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
+let s:linedict = {}
+
+func s:callbackHandler(bufn, channel, msg)
+   if len(a:msg) < 3
+      return
+   elseif a:msg[0] == "d"
+      let l:msg = a:msg[6:-1]
+   else
+      let l:msg = a:msg
+   endif
+   let l:decoded_msg = json_decode(l:msg)
+   let l:newtext = split(l:decoded_msg['content'], "\n", 1)
+   if len(l:newtext) > 0
+      call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
+   else
+      echo "nothing genned"
+   endif
+   if len(newtext) > 1
+      let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
+      let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
+   endif
+   if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
+       echo "Finished generation"
+   endif
+endfunction
+
+func llama#doLlamaGen()
+   if exists("b:job")
+      if job_status(b:job) == "run"
+         call job_stop(b:job)
+         return
+      endif
+   endif
+
+   let l:cbuffer = bufnr("%")
+   let s:linedict[l:cbuffer] = line('$')
+   let l:buflines = getbufline(l:cbuffer, 1, 1000)
+   let l:querydata = copy(s:querydata)
+   call extend(l:querydata, g:llama_overrides)
+   if exists("w:llama_overrides")
+      call extend(l:querydata, w:llama_overrides)
+   endif
+   if exists("b:llama_overrides")
+      call extend(l:querydata, b:llama_overrides)
+   endif
+   if l:buflines[0][0:1] == '!*'
+      let l:userdata = json_decode(l:buflines[0][2:-1])
+      call extend(l:querydata, l:userdata)
+      let l:buflines = l:buflines[1:-1]
+   endif
+   let l:querydata.prompt = join(l:buflines, "\n")
+   let l:curlcommand = copy(s:curlcommand)
+   let l:curlcommand[2] = json_encode(l:querydata)
+   let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
+endfunction
+
+" Echos the tokkenization of the provided string , or cursor to end of word
+" Onus is placed on the user to include the preceding space
+func llama#tokenizeWord(...)
+    if (a:0 > 0)
+        let l:input = a:1
+    else
+        exe "normal \"*ye"
+        let l:input = @*
+    endif
+    let l:querydata = {"content": l:input}
+    let l:curlcommand = copy(s:curlcommand)
+    let l:curlcommand[2] = json_encode(l:querydata)
+    let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
+   let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
+endfunction
+
+func s:tokenizeWordCallback(plaintext, channel, msg)
+    echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
+endfunction
+
+
+" Echos the token count of the entire buffer (or provided string)
+" Example usage :echo llama#tokenCount()
+func llama#tokenCount(...)
+    if (a:0 > 0)
+        let l:buflines = a:1
+    else
+        let l:buflines = getline(1,1000)
+        if l:buflines[0][0:1] == '!*'
+            let l:buflines = l:buflines[1:-1]
+        endif
+        let l:buflines = join(l:buflines, "\n")
+    endif
+    let l:querydata = {"content": l:buflines}
+    let l:curlcommand = copy(s:curlcommand)
+    let l:curlcommand[2] = json_encode(l:querydata)
+    let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
+   let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
+endfunction
+
+func s:tokenCountCallback(channel, msg)
+    let resp = json_decode(a:msg)
+    echo len(resp.tokens)
+endfunction
--- a/examples/llm.vim
+++ b/examples/llm.vim
@@ -1,3 +1,5 @@
+" Basic plugin example
+
 function! Llm()

  let url = "http://127.0.0.1:8080/completion"
@@ -16,8 +18,10 @@ function! Llm()
  " Extract the content field from the response
  let content = json_decode(response).content

+  let split_newlines = split(content, '\n', 1)
+
  " Insert the content at the cursor position
-  call setline(line('.'), getline('.') . content)
+  call setline(line('.'), [ getline('.') . split_newlines[0] ] + split_newlines[1:])
 endfunction

 command! Llm call Llm()
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -140,6 +140,12 @@ The `--ctx-size` option allows you to set the size of the prompt context used by

 -   `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.

+### Extended Context Size
+
+Some fine-tuned models have extened the context length by scaling RoPE. For example, if the original pretrained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
+
+- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
+
 ### Keep Prompt

 The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
@@ -154,9 +160,13 @@ The following options allow you to control the text generation process and fine-

 ### Number of Tokens to Predict

-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity, -2 = until context filled)

-The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
+The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text.
+
+A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--n-keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in significant pause in output.
+
+If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.

 It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.

@@ -202,9 +212,9 @@ Example usage: `--top-p 0.95`

 -   `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).

-Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. The method adjusts the logits (token probabilities) by raising them to the power of the parameter z. A higher value of z (e.g., 2.0) will further suppress less likely tokens from the tail of the distribution, while a value of 1.0 disables the effect of TFS. By setting the parameter z, you can control how much the probabilities of less likely tokens are reduced.
+Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens, and thus disables the effect of TFS.

-Example usage: `--tfs 2.0`
+Example usage: `--tfs 0.95`

 ### Locally Typical Sampling

@@ -278,6 +288,10 @@ These options help improve the performance and memory usage of the LLaMA models.

 -   `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.

+### Grammars
+
+-   `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.
+
 ### Quantization

 For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -4,6 +4,7 @@
 #endif

 #include "common.h"
+#include "console.h"
 #include "llama.h"
 #include "build-info.h"
 #include "grammar-parser.h"
@@ -35,18 +36,16 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static console_state con_st;
 static llama_context ** g_ctx;
-
 static bool is_interacting = false;

 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 void sigint_handler(int signo) {
    if (signo == SIGINT) {
        if (!is_interacting) {
-            is_interacting=true;
+            is_interacting = true;
        } else {
-            console_cleanup(con_st);
+            console::cleanup();
            printf("\n");
            llama_print_timings(*g_ctx);
            _exit(130);
@@ -64,10 +63,8 @@ int main(int argc, char ** argv) {

    // save choice to use color for later
    // (note for later: this is a slightly awkward choice)
-    con_st.use_color = params.use_color;
-    con_st.multiline_input = params.multiline_input;
-    console_init(con_st);
-    atexit([]() { console_cleanup(con_st); });
+    console::init(params.simple_io, params.use_color);
+    atexit([]() { console::cleanup(); });

    if (params.perplexity) {
        printf("\n************\n");
@@ -146,7 +143,7 @@ int main(int argc, char ** argv) {
        {
            fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);

-            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
+            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
            llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
        }

@@ -192,27 +189,31 @@ int main(int argc, char ** argv) {
        }
    }

+    // Add BOS if SPM tokenizer
+    const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+
    // tokenize the prompt
    std::vector<llama_token> embd_inp;

-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
    } else {
        embd_inp = session_tokens;
    }

+    // Should not run without any tokens
+    if (embd_inp.empty()) {
+        embd_inp.push_back(llama_token_bos(ctx));
+    }
+
    // Tokenize negative prompt
    std::vector<llama_token> guidance_inp;
    int guidance_offset = 0;
    int original_prompt_len = 0;
    if (ctx_guidance) {
-        params.cfg_negative_prompt.insert(0, 1, ' ');
-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);

-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
        original_prompt_len = original_inp.size();
        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
    }
@@ -259,8 +260,8 @@ int main(int argc, char ** argv) {
    }

    // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
+    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n",    false);

    // in instruct mode, we inject a prefix and a suffix to each input by the user
    if (params.instruct) {
@@ -273,15 +274,12 @@ int main(int argc, char ** argv) {
        params.interactive = true;
    }

-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
    if (params.verbose_prompt) {
        fprintf(stderr, "\n");
        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }

        if (ctx_guidance) {
@@ -289,14 +287,14 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
            fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
            }
        }

        if (params.n_keep > 0) {
        fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
+                fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
            }
            fprintf(stderr, "'\n");
        }
@@ -314,7 +312,7 @@ int main(int argc, char ** argv) {
        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
        };
-        SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif

        fprintf(stderr, "%s: interactive mode on.\n", __func__);
@@ -355,10 +353,9 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "\n");

        {
-            auto it = params.logit_bias.find(llama_token_eos());
+            auto it = params.logit_bias.find(llama_token_eos(ctx));
            if (it != params.logit_bias.end() && it->second == -INFINITY) {
-                fprintf(stderr,
-                    "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
+                fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
            }
        }

@@ -373,7 +370,7 @@ int main(int argc, char ** argv) {

    if (params.interactive) {
        const char *control_message;
-        if (con_st.multiline_input) {
+        if (params.multiline_input) {
            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
                              " - To return control without starting a new line, end your input with '/'.\n";
        } else {
@@ -401,14 +398,14 @@ int main(int argc, char ** argv) {
    int n_past_guidance    = 0;

    // the first thing we will do is to output the prompt, so set color accordingly
-    console_set_color(con_st, CONSOLE_COLOR_PROMPT);
+    console::set_display(console::prompt);

    std::vector<llama_token> embd;
    std::vector<llama_token> embd_guidance;

    // do one empty run to warm up the model
    {
-        const std::vector<llama_token> tmp = { llama_token_bos(), };
+        const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
        llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
        llama_reset_timings(ctx);
    }
@@ -422,9 +419,9 @@ int main(int argc, char ** argv) {
            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
            if ((int)embd.size() > max_embd_size) {
                auto skipped_tokens = embd.size() - max_embd_size;
-                console_set_color(con_st, CONSOLE_COLOR_ERROR);
+                console::set_display(console::error);
                printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
-                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+                console::set_display(console::reset);
                fflush(stdout);
                embd.resize(max_embd_size);
            }
@@ -434,8 +431,12 @@ int main(int argc, char ** argv) {
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
-                const int n_left = n_past - params.n_keep;
+                if (params.n_predict == -2) {
+                    fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__);
+                    break;
+                }

+                const int n_left = n_past - params.n_keep;
                // always keep the first token - BOS
                n_past = std::max(1, params.n_keep);
                n_past_guidance = std::max(1, params.n_keep + guidance_offset);
@@ -449,7 +450,7 @@ int main(int argc, char ** argv) {
                //printf("\n---\n");
                //printf("resetting: '");
                //for (int i = 0; i < (int) embd.size(); i++) {
-                //    printf("%s", llama_token_to_str(ctx, embd[i]));
+                //    printf("%s", llama_token_to_piece(ctx, embd[i]));
                //}
                //printf("'\n");
                //printf("\n---\n");
@@ -502,7 +503,7 @@ int main(int argc, char ** argv) {
                    input_size = embd_guidance.size();
                    //fprintf(stderr, "\n---------------------\n");
                    //for (int i = 0; i < (int) embd_guidance.size(); i++) {
-                        //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
+                        //fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i]));
                    //}
                    //fprintf(stderr, "\n---------------------\n");
                } else {
@@ -588,7 +589,7 @@ int main(int argc, char ** argv) {
                }

                // Apply penalties
-                float nl_logit = logits[llama_token_nl()];
+                float nl_logit = logits[llama_token_nl(ctx)];
                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
                llama_sample_repetition_penalty(ctx, &candidates_p,
                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -597,7 +598,12 @@ int main(int argc, char ** argv) {
                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                    last_n_repeat, alpha_frequency, alpha_presence);
                if (!penalize_nl) {
-                    logits[llama_token_nl()] = nl_logit;
+                    for (size_t idx = 0; idx < candidates_p.size; idx++) {
+                        if (candidates_p.data[idx].id == llama_token_nl(ctx)) {
+                            candidates_p.data[idx].logit = nl_logit;
+                            break;
+                        }
+                    }
                }

                if (grammar != NULL) {
@@ -661,13 +667,13 @@ int main(int argc, char ** argv) {
        // display text
        if (input_echo) {
            for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id));
+                printf("%s", llama_token_to_piece(ctx, id).c_str());
            }
            fflush(stdout);
        }
        // reset color to default if we there is no pending user input
        if (input_echo && (int)embd_inp.size() == n_consumed) {
-            console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+            console::set_display(console::reset);
        }

        // if not currently processing queued inputs;
@@ -677,7 +683,7 @@ int main(int argc, char ** argv) {
            if (params.antiprompt.size()) {
                std::string last_output;
                for (auto id : last_n_tokens) {
-                    last_output += llama_token_to_str(ctx, id);
+                    last_output += llama_token_to_piece(ctx, id);
                }

                is_antiprompt = false;
@@ -693,7 +699,7 @@ int main(int argc, char ** argv) {
                    if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
                        if (params.interactive) {
                            is_interacting = true;
-                            console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
+                            console::set_display(console::user_input);
                        }
                        is_antiprompt = true;
                        fflush(stdout);
@@ -703,7 +709,7 @@ int main(int argc, char ** argv) {
            }

            // deal with end of text token in interactive mode
-            if (last_n_tokens.back() == llama_token_eos()) {
+            if (last_n_tokens.back() == llama_token_eos(ctx)) {
                if (params.interactive) {
                    if (params.antiprompt.size() != 0) {
                        // tokenize and inject first reverse prompt
@@ -714,7 +720,7 @@ int main(int argc, char ** argv) {

                    is_interacting = true;
                    printf("\n");
-                    console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
+                    console::set_display(console::user_input);
                    fflush(stdout);
                } else if (params.instruct) {
                    is_interacting = true;
@@ -727,7 +733,7 @@ int main(int argc, char ** argv) {
                }

                if (params.input_prefix_bos) {
-                    embd_inp.push_back(llama_token_bos());
+                    embd_inp.push_back(llama_token_bos(ctx));
                }

                std::string buffer;
@@ -739,12 +745,12 @@ int main(int argc, char ** argv) {
                std::string line;
                bool another_line = true;
                do {
-                    another_line = console_readline(con_st, line);
+                    another_line = console::readline(line, params.multiline_input);
                    buffer += line;
                } while (another_line);

                // done taking input, reset color
-                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+                console::set_display(console::reset);

                // Add tokens to embd only if the input buffer is non-empty
                // Entering a empty line lets the user pass control back
@@ -781,8 +787,7 @@ int main(int argc, char ** argv) {
                    if (grammar != NULL) {
                        llama_grammar_free(grammar);

-                        std::vector<const llama_grammar_element *> grammar_rules(
-                            parsed_grammar.c_rules());
+                        std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
                        grammar = llama_grammar_init(
                            grammar_rules.data(), grammar_rules.size(),
                            parsed_grammar.symbol_ids.at("root"));
@@ -793,13 +798,14 @@ int main(int argc, char ** argv) {
        }

        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos() && !(params.instruct || params.interactive)) {
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
            fprintf(stderr, " [end of text]\n");
            break;
        }

        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
-        if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
+        // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
+        if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
            n_remain = params.n_predict;
            is_interacting = true;
        }
--- a/examples/make-ggml.py
+++ b/examples/make-ggml.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 """
 This script converts Hugging Face llama models to GGML and quantizes them.

--- a/examples/metal/metal.cpp
+++ b/examples/metal/metal.cpp
@@ -2,7 +2,7 @@
 //
 // - First, export a LLaMA graph:
 //
-//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export
+//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
 //
 // - Run this tool to evaluate the exported graph:
 //
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -5,6 +5,9 @@
 #include <cmath>
 #include <ctime>
 #include <sstream>
+#include <cstring>
+#include <thread>
+#include <mutex>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -26,14 +29,69 @@ std::vector<float> softmax(const std::vector<float>& logits) {
    return probs;
 }

-void perplexity(llama_context * ctx, const gpt_params & params) {
+float log_softmax(int n_vocab, const float * logits, int tok) {
+    float max_logit = logits[0];
+    for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
+    double sum_exp = 0.0;
+    for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit);
+    return logits[tok] - max_logit - log(sum_exp);
+}
+
+void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread>& workers,
+        double& nll, double& nll2) {
+
+    std::mutex mutex;
+    int counter = 0;
+    auto compute = [&mutex, &counter, &nll, &nll2, n_vocab, logits, tokens, n_token] () {
+        double local_nll = 0, local_nll2 = 0;
+        while (true) {
+            std::unique_lock<std::mutex> lock(mutex);
+            int i = counter++;
+            if (i >= n_token) {
+                nll += local_nll; nll2 += local_nll2;
+                break;
+            }
+            lock.unlock();
+            double v = -log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
+            local_nll += v;
+            local_nll2 += v*v;
+        }
+    };
+    for (auto& w : workers) w = std::thread(compute);
+    compute();
+    for (auto& w : workers) w.join();
+
+}
+
+void perplexity_v2(llama_context * ctx, const gpt_params & params) {
    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
    // Output: `perplexity: 13.5106 [114/114]`
    // BOS tokens will be added for each chunk before eval
-    auto tokens = ::llama_tokenize(ctx, params.prompt, true);

-    const int n_chunk_max = tokens.size() / params.n_ctx;
+    if (params.ppl_stride <= 0) {
+        fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
+        return;
+    }
+
+    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    const bool add_bos = is_spm;
+
+    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+
+    auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+
+    const int calc_chunk = params.n_ctx;
+
+    fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
+
+    if (int(tokens.size()) <= calc_chunk) {
+        fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
+                tokens.size(), params.n_ctx, params.ppl_stride);
+        return;
+    }
+
+    const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1)  / params.ppl_stride;

    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
    const int n_vocab = llama_n_vocab(ctx);
@@ -44,6 +102,116 @@ void perplexity(llama_context * ctx, const gpt_params & params) {

    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);

+    for (int i = 0; i < n_chunk; ++i) {
+        const int start =     i * params.ppl_stride;
+        const int end   = start + calc_chunk;
+
+        const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
+        //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
+
+        std::vector<float> logits;
+
+        const auto t_start = std::chrono::high_resolution_clock::now();
+
+        for (int j = 0; j < num_batches; ++j) {
+            const int batch_start = start + j * n_batch;
+            const int batch_size  = std::min(end - batch_start, n_batch);
+
+            //fprintf(stderr, "    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
+            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
+                //fprintf(stderr, "%s : failed to eval\n", __func__);
+                return;
+            }
+
+            // save original token and restore it after eval
+            const auto token_org = tokens[batch_start];
+
+            // add BOS token for the first batch of each chunk
+            if (add_bos && j == 0) {
+                tokens[batch_start] = llama_token_bos(ctx);
+            }
+
+            const auto batch_logits = llama_get_logits(ctx);
+            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+
+            if (j == 0) {
+                tokens[batch_start] = token_org;
+            }
+        }
+
+        const auto t_end = std::chrono::high_resolution_clock::now();
+
+        if (i == 0) {
+            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            int total_seconds = (int)(t_total * n_chunk);
+            if (total_seconds >= 60*60) {
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                total_seconds = total_seconds % (60*60);
+            }
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+        }
+
+        //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
+        for (int j = params.n_ctx - params.ppl_stride - 1; j < params.n_ctx - 1; ++j) {
+
+            // Calculate probability of next token, given the previous ones.
+            const std::vector<float> tok_logits(
+                logits.begin() + (j + 0) * n_vocab,
+                logits.begin() + (j + 1) * n_vocab);
+
+            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
+
+            nll += -std::log(prob);
+            ++count;
+        }
+        // perplexity is e^(average negative log-likelihood)
+        if (params.ppl_output_type == 0) {
+            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        } else {
+            printf("%8d  %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
+        }
+        fflush(stdout);
+    }
+    printf("\n");
+}
+
+void perplexity(llama_context * ctx, const gpt_params & params) {
+    if (params.ppl_stride > 0) {
+        perplexity_v2(ctx, params);
+        return;
+    }
+
+    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+    // Output: `perplexity: 13.5106 [114/114]`
+    // BOS tokens will be added for each chunk before eval
+
+    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    const bool add_bos = is_spm;
+
+    auto tim1 = std::chrono::high_resolution_clock::now();
+    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+
+    auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+
+    auto tim2 = std::chrono::high_resolution_clock::now();
+    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+
+    const int n_chunk_max = tokens.size() / params.n_ctx;
+
+    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
+    const int n_vocab = llama_n_vocab(ctx);
+    const int n_batch = params.n_batch;
+
+    int count = 0;
+    double nll = 0.0;
+    double nll2 = 0.0;
+
+    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+
+    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
+
    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * params.n_ctx;
        const int end   = start + params.n_ctx;
@@ -62,8 +230,8 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
            const auto token_org = tokens[batch_start];

            // add BOS token for the first batch of each chunk
-            if (j == 0) {
-                tokens[batch_start] = llama_token_bos();
+            if (add_bos && j == 0) {
+                tokens[batch_start] = llama_token_bos(ctx);
            }

            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
@@ -88,7 +256,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            fprintf(stderr, "%d minutes\n", total_seconds / 60);
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }

        // We get the logits for all the tokens in the context window (params.n_ctx)
@@ -103,26 +271,72 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
        // Example, we have a context window of 512, we will compute perplexity for each of the
        // last 256 tokens.  Then, we split the input up into context window size chunks to
        // process the entire prompt.
-        for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
-            // Calculate probability of next token, given the previous ones.
-            const std::vector<float> tok_logits(
-                logits.begin() + (j + 0) * n_vocab,
-                logits.begin() + (j + 1) * n_vocab);
+        const int first = std::min(512, params.n_ctx/2);
+        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, workers, nll, nll2);
+        count += params.n_ctx - first - 1;

-            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
-
-            nll += -std::log(prob);
-            ++count;
-        }
        // perplexity is e^(average negative log-likelihood)
-        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        if (params.ppl_output_type == 0) {
+            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        } else {
+            double av = nll/count;
+            double av2 = nll2/count - av*av;
+            if (av2 > 0) av2 = sqrt(av2/(count-1));
+            printf("%8d  %.4lf  %4lf  %4lf\n", i*params.n_ctx, std::exp(nll / count), av, av2);
+        }
        fflush(stdout);
    }
    printf("\n");
+    nll2 /= count;
+    nll /= count;
+    nll2 -= nll * nll;
+    if (nll2 > 0) {
+        nll2 = sqrt(nll2/(count-1));
+        double ppl = exp(nll);
+        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+    } else {
+        printf("Unexpected negative standard deviation of log(prob)\n");
+    }
 }

-void perplexity_lines(llama_context * ctx, const gpt_params & params) {
-    // Calculates perplexity over each line of the prompt
+std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
+        int n_vocab, int n_thread) {
+    std::vector<float> result;
+    result.reserve(tokens.size() * n_vocab);
+    size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
+    for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
+        size_t n_tokens = tokens.size() - i_chunk * n_batch;
+        n_tokens = std::min(n_tokens, size_t(n_batch));
+        if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return {};
+        }
+
+        const auto logits = llama_get_logits(ctx);
+        result.insert(result.end(), logits, logits + n_tokens * n_vocab);
+
+        n_past += n_tokens;
+    }
+    return result;
+}
+
+void hellaswag_score(llama_context * ctx, const gpt_params & params) {
+    // Calculates hellaswag score (acc_norm) from prompt
+    //
+    // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
+    // All used data fields are preprocessed as in https://github.com/EleutherAI/lm-evaluation-harness/blob/df3da98c5405deafd519c2ddca52bb7c3fe36bef/lm_eval/tasks/hellaswag.py#L62-L68
+    //
+    // All 10042 tasks should be extracted to keep the results standardized like other implementations.
+    //
+    // Datafile layout:
+    // ['??'] denotes json fields
+    // 6 lines per task:
+    // ['activity_label'] + ": " +['ctx']  - The first part of the query, the context
+    // ['label'] - The index the best common sense ending aka gold ending
+    // ['endings'][0] - Endings added to the first part of the query
+    // ['endings'][1]
+    // ['endings'][2]
+    // ['endings'][3]

    std::vector<std::string> prompt_lines;
    std::istringstream strstream(params.prompt);
@@ -132,63 +346,207 @@ void perplexity_lines(llama_context * ctx, const gpt_params & params) {
        prompt_lines.push_back(line);
    }

+    if( prompt_lines.size() % 6 != 0) {
+        fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
+        return;
+    }
+
+    size_t hs_task_count = prompt_lines.size()/6;
+    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
+
+    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    fprintf(stderr, "================================= is_spm = %d\n", is_spm);
+
+    // This is needed as usual for LLaMA models
+    const bool add_bos = is_spm;
+
+    // Number of tasks to use when computing the score
+    if ( params.hellaswag_tasks < hs_task_count  ) {
+        hs_task_count = params.hellaswag_tasks;
+    }
+
+    // The tasks should be randomized so the score stabilizes quickly.
+    bool randomize_tasks = true;
+
+    // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now
+    std::mt19937 rng(1);
+
+    // Dataholder for hellaswag tasks
+    struct hs_data_t {
+        std::string context;
+        size_t gold_ending_idx;
+        std::string ending[4];
+        size_t ending_logprob_count[4];
+        double ending_logprob[4];
+    };
+
+    fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first")  );
+
+    // Select and read data from prompt lines
+    hs_data_t *hs_data = new hs_data_t[hs_task_count];
+    for (size_t i=0; i < hs_task_count; i++) {
+        size_t idx = i;
+
+        // Select a random example of those left in the prompt
+        if (randomize_tasks) {
+            std::uniform_int_distribution<size_t> dist(0, prompt_lines.size()/6-1 ) ;
+            idx = dist(rng);
+        }
+
+        hs_data[i].context = prompt_lines[idx*6];
+        hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
+        for (size_t j=0; j < 4; j++) {
+            hs_data[i].ending[j] = prompt_lines[idx*6+2+j];
+        }
+
+        // Delete the selected random example from the prompt
+        if (randomize_tasks) {
+            prompt_lines.erase( std::next(prompt_lines.begin(),idx*6)  , std::next(prompt_lines.begin(),idx*6+6) );
+        }
+    }
+
+    fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
+    printf("\ntask\tacc_norm\n");
+
+    double acc = 0.0f;
    const int n_vocab = llama_n_vocab(ctx);

-    int counttotal   = 0;
-    size_t n_lines = prompt_lines.size();
+    std::vector<std::vector<int>> ending_tokens(4);

-    double nll = 0.0;
+    std::vector<float> tok_logits(n_vocab);

-    fprintf(stderr, "%s: calculating perplexity over %lu lines\n", __func__, n_lines);
+    for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
+        // Tokenize the context to count tokens
+        std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
+        size_t context_size = context_embd.size();

-    printf("\nLine\tPPL line\tPPL cumulative\n");
+        for (int i = 0; i < 4; ++i) {
+            ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + " " + hs_data[task_idx].ending[i], add_bos);
+            for (int k = 0; k < int(context_size); ++k) {
+                if (ending_tokens[i][k] != context_embd[k]) {
+                    fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k);
+                    break;
+                }
+            }
+        }

-    for (size_t i = 0; i < n_lines; ++i) {
+        // Do the 1st ending
+        // In this case we include the context when evaluating
+        //auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
+        auto query_embd = ending_tokens[0];
+        auto query_size = query_embd.size();

-        // Tokenize and insert BOS at start
-        std::vector<int> batch_embd = ::llama_tokenize(ctx, prompt_lines[i], true);
-
-        size_t batch_size  = batch_embd.size();
-
-        // Stop if line is too long
-        if( batch_size > (size_t)params.n_ctx ) {
-            fprintf(stderr, "%s : tokens in line %lu > n_ctxl\n", __func__, i);
+        // Stop if query wont fit the ctx window
+        if (query_size > (size_t)params.n_ctx) {
+            fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
            return;
        }

-        if (llama_eval(ctx, batch_embd.data(), batch_size, 0, params.n_threads)) {
+        // Speedup small evaluations by evaluating atleast 32 tokens
+        if (query_size < 32) {
+            query_embd.resize(32);
+        }
+
+        auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
+        if (logits.empty()) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return;
        }

-        const auto batch_logits = llama_get_logits(ctx);
-        std::vector<float> logits;
-        logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+        std::memcpy(tok_logits.data(), logits.data() + (context_size-1)*n_vocab, n_vocab*sizeof(float));
+        const auto first_probs = softmax(tok_logits);

-        double nllline = 0.0;
-        int countline = 0;
+        hs_data[task_idx].ending_logprob_count[0] = 1;
+        hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]);

-        // Perplexity over second half of the line
-        for (size_t j = batch_size/2; j < batch_size - 1; ++j) {
-            // Calculate probability of next token, given the previous ones.
-            const std::vector<float> tok_logits(
-                logits.begin() + (j + 0) * n_vocab,
-                logits.begin() + (j + 1) * n_vocab);
+        // Calculate the logprobs over the ending
+        for (size_t j = context_size; j < query_size - 1; j++) {

-            const float prob = softmax(tok_logits)[batch_embd[ j + 1]];
+            std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));

-            nllline += -std::log(prob);
-            ++countline;
+            const float prob = softmax(tok_logits)[query_embd[j + 1]];
+
+            hs_data[task_idx].ending_logprob[0] += std::log(prob);
+            hs_data[task_idx].ending_logprob_count[0]++;
        }

-        nll += nllline;
-        counttotal += countline;
+        // Calculate the mean token logprob for acc_norm
+        hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0];

-        // perplexity is e^(average negative log-likelihood)
-        printf("%lu\t%.8lf\t%.8lf\n", i + 1, std::exp(nllline/countline), std::exp(nll / counttotal) );
+        // Do the remaining endings
+        // For these, we use the bare ending with n_past = context_size
+        //
+        for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
+
+            // Tokenize the query
+            query_embd.resize(ending_tokens[ending_idx].size() - context_size);
+            std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int));
+            query_size = query_embd.size();
+
+            // Stop if query wont fit the ctx window
+            if (context_size + query_size > (size_t)params.n_ctx) {
+                fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
+                return;
+            }
+
+            // Speedup small evaluations by evaluating atleast 32 tokens
+            // No, resizing to 32 is actually slightly slower (at least on CUDA)
+            //if (query_size < 32) {
+            //    query_embd.resize(32);
+            //}
+
+            // Evaluate the query
+            logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab, params.n_threads);
+            if (logits.empty()) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return;
+            }
+
+            hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
+            hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);
+
+            // Calculate the logprobs over the ending
+            for (size_t j = 0; j < query_size - 1; j++) {
+                std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
+
+                const float prob = softmax(tok_logits)[query_embd[j + 1]];
+
+                hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
+                hs_data[task_idx].ending_logprob_count[ending_idx]++;
+            }
+
+            // Calculate the mean token logprob for acc_norm
+            hs_data[task_idx].ending_logprob[ending_idx] /= hs_data[task_idx].ending_logprob_count[ending_idx];
+
+
+//            printf("task %lu, ending %lu, whole_len %lu, context_len %lu, ending_logprob_count %lu, ending_logprob %.4f\n",
+//                task_idx,ending_idx,whole_size,context_size, hs_data[task_idx].ending_logprob_count[ending_idx], hs_data[task_idx].ending_logprob[ending_idx] );
+        }
+
+        // Find the ending with maximum logprob
+        size_t ending_logprob_max_idx = 0;
+        double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];
+        for (size_t j = 1; j < 4; j++) {
+            if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
+                ending_logprob_max_idx = j;
+                ending_logprob_max_val =  hs_data[task_idx].ending_logprob[j];
+            }
+        }
+
+//        printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_data[task_idx].gold_ending_idx);
+
+        // If the gold ending got the maximum logprobe add one accuracy point
+        if (ending_logprob_max_idx == hs_data[task_idx].gold_ending_idx) {
+            acc += 1.0;
+        }
+
+        // Print the accumulated accuracy mean x 100
+        printf("%zu\t%.8lf\n",task_idx+1, acc/double(task_idx+1)*100.0);
        fflush(stdout);
    }

+    delete [] hs_data;
+
    printf("\n");
 }

@@ -203,6 +561,12 @@ int main(int argc, char ** argv) {
    params.perplexity = true;
    params.n_batch = std::min(params.n_batch, params.n_ctx);

+    if (params.ppl_stride > 0) {
+        fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
+                params.n_ctx, params.n_ctx + params.ppl_stride/2);
+        params.n_ctx += params.ppl_stride/2;
+    }
+
    if (params.n_ctx > 2048) {
        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                "expect poor results\n", __func__, params.n_ctx);
@@ -240,8 +604,8 @@ int main(int argc, char ** argv) {
                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
    }

-    if (params.perplexity_lines) {
-        perplexity_lines(ctx, params);
+    if (params.hellaswag) {
+        hellaswag_score(ctx, params);
    } else {
        perplexity(ctx, params);
    }
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -24,7 +24,7 @@
 #endif

 struct quantize_stats_params {
-    std::string model = "models/7B/ggml-model-f16.bin";
+    std::string model = "models/7B/ggml-model-f16.gguf";
    bool verbose = false;
    bool per_layer_stats = false;
    bool print_histogram = false;
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -14,25 +14,25 @@ struct quant_option {
 };

 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.50G, +0.2499 ppl @ 7B", },
-    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1846 ppl @ 7B", },
-    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.30G, +0.0796 ppl @ 7B", },
-    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0415 ppl @ 7B", },
+    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
+    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
+    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
+    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
 #ifdef GGML_USE_K_QUANTS
-    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.67G, +0.8698 ppl @ 7B", },
+    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", },
-    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", },
-    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", },
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
-    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", },
-    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", },
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
-    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", },
-    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", },
-    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0044 ppl @ 7B", },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
+    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
 #endif
-    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ 7B", },
+    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
 };
@@ -68,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 }

 // usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
    fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
    fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
    fprintf(stderr, "\nAllowed quantization types:\n");
@@ -100,7 +100,7 @@ int main(int argc, char ** argv) {
        }
    }

-    if (argc - arg_idx < 3) {
+    if (argc - arg_idx < 2) {
        usage(argv[0]);
    }

@@ -114,12 +114,12 @@ int main(int argc, char ** argv) {
    std::string ftype_str;
    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
        std::string fpath;
-        const size_t pos = fname_inp.find_last_of('/');
+        const size_t pos = fname_inp.find_last_of("/\\");
        if (pos != std::string::npos) {
            fpath = fname_inp.substr(0, pos + 1);
        }
-        // export as [inp path]/ggml-model-[ftype].bin
-        fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
+        // export as [inp path]/ggml-model-[ftype].gguf
+        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
        arg_idx++;
    }
    else {
--- a/examples/reason-act.sh
+++ b/examples/reason-act.sh
@@ -1,4 +1,3 @@
-
 #!/bin/bash

 cd `dirname $0`
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -44,9 +44,8 @@ int main(int argc, char ** argv) {
        llama_free_model(model);
        return 1;
    }
-    auto tokens = std::vector<llama_token>(params.n_ctx);
-    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
-
+    auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
+    auto n_prompt_tokens = tokens.size();
    if (n_prompt_tokens < 1) {
        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
        llama_free(ctx);
@@ -88,10 +87,10 @@ int main(int argc, char ** argv) {
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        auto next_token = llama_sample_token(ctx, &candidates_p);
-        auto next_token_str = llama_token_to_str(ctx, next_token);
+        auto next_token_str = llama_token_to_piece(ctx, next_token);
        last_n_tokens_data.push_back(next_token);

-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx);
@@ -148,10 +147,10 @@ int main(int argc, char ** argv) {
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        auto next_token = llama_sample_token(ctx2, &candidates_p);
-        auto next_token_str = llama_token_to_str(ctx2, next_token);
+        auto next_token_str = llama_token_to_piece(ctx2, next_token);
        last_n_tokens_data.push_back(next_token);

-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx2);
--- a/examples/server-llama2-13B.sh
+++ b/examples/server-llama2-13B.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+set -e
+
+cd "$(dirname "$0")/.." || exit
+
+# Specify the model you want to use here:
+MODEL="${MODEL:-./models/llama-2-13b-chat.ggmlv3.q5_K_M.bin}"
+PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat-system.txt}
+
+# Adjust to the number of CPU cores you want to use.
+N_THREAD="${N_THREAD:-12}"
+
+# Note: you can also override the generation options by specifying them on the command line:
+GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"
+
+
+# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
+./server $GEN_OPTIONS \
+  --model "$MODEL" \
+  --threads "$N_THREAD" \
+  --rope-freq-scale 1.0 \
+  "$@"
+
+# I used this to test the model with mps, but omitted it from the general purpose. If you want to use it, just specify it on the command line.
+# -ngl 1 \
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -5,7 +5,7 @@ This example demonstrates a simple HTTP API server and a simple web front end to
 Command line options:

 -   `--threads N`, `-t N`: Set the number of threads to use during computation.
-   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
 -   `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 -   `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 -   `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@@ -16,6 +16,7 @@ Command line options:
 -   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
 -   `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 -   `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
+-   `--numa`: Attempt optimizations that help on some NUMA systems.
 -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
 -   `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
@@ -47,15 +48,14 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./server -m models/7B/ggml-model.bin -c 2048
+./server -m models/7B/ggml-model.gguf -c 2048
 ```

 ### Windows:

 ```powershell
-server.exe -m models\7B\ggml-model.bin -c 2048
+server.exe -m models\7B\ggml-model.gguf -c 2048
 ```
-
 The above command will start a server that by default listens on `127.0.0.1:8080`.
 You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.

@@ -77,34 +77,31 @@ You need to have [Node.js](https://nodejs.org/en) installed.
 ```bash
 mkdir llama-client
 cd llama-client
-npm init
-npm install axios
 ```

 Create a index.js file and put inside this:

 ```javascript
-const axios = require("axios");
-
 const prompt = `Building a website can be done in 10 simple steps:`;

 async function Test() {
-    let result = await axios.post("http://127.0.0.1:8080/completion", {
-        prompt,
-        n_predict: 512,
-    });
-
-    // the response is received until completion finish
-    console.log(result.data.content);
+    let response = await fetch("http://127.0.0.1:8080/completion", {
+        method: 'POST',
+        body: JSON.stringify({
+            prompt,
+            n_predict: 512,
+        })
+    })
+    console.log((await response.json()).content)
 }

-Test();
+Test()
 ```

 And run it:

 ```bash
-node .
+node index.js
 ```

 ## API Endpoints
@@ -126,7 +123,7 @@ node .

    `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.

-    `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does.
+    `prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.

    `stop`: Specify a JSON array of stopping strings.
    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
@@ -151,6 +148,8 @@ node .

    `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).

+    `grammar`: Set grammar for grammar-based sampling (default: no grammar)
+
    `seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).

    `ignore_eos`: Ignore end of stream token and continue generating (default: false).
@@ -163,7 +162,13 @@ node .

    `content`: Set the text to tokenize.

-    Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
+    Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
+
+-   **POST** `/detokenize`: Convert tokens to text.
+
+    *Options:*
+
+    `tokens`: Set the tokens to detokenize.

 -   **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.

--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 import argparse
 from flask import Flask, jsonify, request, Response
 import urllib.parse
--- a/examples/server/chat-llama2.sh
+++ b/examples/server/chat-llama2.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+
+API_URL="${API_URL:-http://127.0.0.1:8080}"
+
+CHAT=(
+    "Hello, Assistant."
+    "Hello. How may I help you today?"
+)
+
+INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
+
+trim() {
+    shopt -s extglob
+    set -- "${1##+([[:space:]])}"
+    printf "%s" "${1%%+([[:space:]])}"
+}
+
+trim_trailing() {
+    shopt -s extglob
+    printf "%s" "${1%%+([[:space:]])}"
+}
+
+format_prompt() {
+    if [[ "${#CHAT[@]}" -eq 0 ]]; then
+        echo -n "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>"
+    else
+        LAST_INDEX=$(( ${#CHAT[@]} - 1 ))
+        echo -n "${CHAT[$LAST_INDEX]}\n[INST] $1 [/INST]"
+    fi
+}
+
+tokenize() {
+    curl \
+        --silent \
+        --request POST \
+        --url "${API_URL}/tokenize" \
+        --header "Content-Type: application/json" \
+        --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
+    | jq '.tokens[]'
+}
+
+N_KEEP=$(tokenize "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>" | wc -l)
+
+chat_completion() {
+    PROMPT="$(trim_trailing "$(format_prompt "$1")")"
+    DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
+        prompt: .,
+        temperature: 0.2,
+        top_k: 40,
+        top_p: 0.9,
+        n_keep: $n_keep,
+        n_predict: 1024,
+        stop: ["[INST]"],
+        stream: true
+    }')"
+
+    # Create a temporary file to hold the Python output
+    TEMPFILE=$(mktemp)
+
+    exec 3< <(curl \
+        --silent \
+        --no-buffer \
+        --request POST \
+        --url "${API_URL}/completion" \
+        --header "Content-Type: application/json" \
+        --data-raw "${DATA}")
+
+    python -c "
+import json
+import sys
+
+answer = ''
+while True:
+    line = sys.stdin.readline()
+    if not line:
+        break
+    if line.startswith('data: '):
+        json_content = line[6:].strip()
+        content = json.loads(json_content)['content']
+        sys.stdout.write(content)
+        sys.stdout.flush()
+        answer += content
+
+answer = answer.rstrip('\n')
+
+# Write the answer to the temporary file
+with open('$TEMPFILE', 'w') as f:
+    f.write(answer)
+    " <&3
+
+    exec 3<&-
+
+    # Read the answer from the temporary file
+    ANSWER=$(cat $TEMPFILE)
+
+    # Clean up the temporary file
+    rm $TEMPFILE
+
+    printf "\n"
+
+    CHAT+=("$1" "$(trim "$ANSWER")")
+}
+
+while true; do
+    echo -en "\033[0;32m"  # Green color
+    read -r -e -p "> " QUESTION
+    echo -en "\033[0m"  # Reset color
+    chat_completion "${QUESTION}"
+done
--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@@ -1,5 +1,34 @@
 import * as readline from 'node:readline'
 import { stdin, stdout } from 'node:process'
+import { readFileSync } from 'node:fs'
+import { SchemaConverter }  from './public/json-schema-to-grammar.mjs'
+
+const args = process.argv.slice(2);
+const grammarJsonSchemaFile = args.find(
+    (_, index) => args[index - 1] === "--grammar-json-schema"
+);
+const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
+
+// Example usage: function,arguments
+const grammarJsonSchemaPropOrder = args.find(
+    (_, index) => args[index - 1] === "--grammar-json-schema-prop-order"
+);
+const propOrder = grammarJsonSchemaPropOrder
+    ? grammarJsonSchemaPropOrder
+          .split(",")
+          .reduce((acc, cur, index) => ({ ...acc, [cur]: index }), {})
+    : {};
+
+let grammar = null
+if (grammarJsonSchemaFile) {
+    const schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
+    const converter = new SchemaConverter(propOrder)
+    converter.visit(schema, '')
+    grammar = converter.formatGrammar()
+}
+if (grammarFile) {
+    grammar = readFileSync(grammarFile, 'utf-8')
+}

 const API_URL = 'http://127.0.0.1:8080'

@@ -48,6 +77,7 @@ async function chat_completion(question) {
            n_keep: n_keep,
            n_predict: 256,
            stop: ["\n### Human:"], // stop completion after generating this
+            grammar,
            stream: true,
        })
    })
--- a/examples/server/chat.sh
+++ b/examples/server/chat.sh
--- a/examples/server/completion.js.hpp
+++ b/examples/server/completion.js.hpp
@@ -87,289 +87,342 @@ unsigned char completion_js[] = {
  0x20, 0x54, 0x65, 0x78, 0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72,
  0x28, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63,
  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d,
-  0x20, 0x74, 0x72, 0x75, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20,
-  0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72,
-  0x2e, 0x72, 0x65, 0x61, 0x64, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x2e, 0x64, 0x6f, 0x6e, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x73, 0x65, 0x20, 0x61,
-  0x6e, 0x73, 0x77, 0x65, 0x72, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68,
-  0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69,
-  0x70, 0x6c, 0x65, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x6f, 0x66,
-  0x3a, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x5c, 0x6e, 0x20, 0x77, 0x69,
-  0x74, 0x68, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x61, 0x6c, 0x77, 0x61,
-  0x79, 0x73, 0x20, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x74, 0x20, 0x61,
-  0x73, 0x20, 0x61, 0x20, 0x6b, 0x65, 0x79, 0x2e, 0x20, 0x69, 0x6e, 0x20,
-  0x6f, 0x75, 0x72, 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, 0x77, 0x65, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x6d, 0x61, 0x69,
-  0x6e, 0x6c, 0x79, 0x20, 0x63, 0x61, 0x72, 0x65, 0x20, 0x61, 0x62, 0x6f,
-  0x75, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3a,
-  0x20, 0x6b, 0x65, 0x79, 0x20, 0x68, 0x65, 0x72, 0x65, 0x2c, 0x20, 0x77,
-  0x68, 0x69, 0x63, 0x68, 0x20, 0x77, 0x65, 0x20, 0x65, 0x78, 0x70, 0x65,
-  0x63, 0x74, 0x20, 0x61, 0x73, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74,
-  0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65,
-  0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x28, 0x72, 0x65, 0x73,
-  0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x70, 0x61,
-  0x72, 0x73, 0x65, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20,
-  0x65, 0x76, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61,
-  0x64, 0x64, 0x20, 0x74, 0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20,
-  0x3d, 0x20, 0x2f, 0x5e, 0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73,
-  0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x6f, 0x66, 0x20,
-  0x74, 0x65, 0x78, 0x74, 0x2e, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x41, 0x6c,
-  0x6c, 0x28, 0x72, 0x65, 0x67, 0x65, 0x78, 0x29, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75,
-  0x6c, 0x74, 0x5b, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d,
-  0x20, 0x3d, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6e, 0x63, 0x65, 0x20,
-  0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, 0x74, 0x68, 0x69, 0x73,
-  0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70,
-  0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x6a, 0x75, 0x73,
-  0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x64, 0x61, 0x74,
-  0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75,
-  0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53,
-  0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73,
-  0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
-  0x20, 0x2b, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
-  0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x79,
-  0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79,
-  0x69, 0x65, 0x6c, 0x64, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x69,
-  0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20, 0x73,
-  0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66, 0x72,
-  0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20, 0x77,
-  0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b,
-  0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
-  0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
-  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e,
-  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
-  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e,
-  0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
-  0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
-  0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72,
-  0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63,
-  0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, 0x6d,
-  0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, 0x74,
-  0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e,
-  0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f,
-  0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b,
-  0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20,
-  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, 0x74,
-  0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x79,
-  0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x63, 0x72,
-  0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f,
-  0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f,
-  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72,
-  0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65,
-  0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, 0x66,
-  0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
-  0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a,
-  0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28,
-  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76,
-  0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28,
-  0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x28,
-  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
-  0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
-  0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x2e,
-  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70,
-  0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
-  0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
-  0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b,
-  0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20,
-  0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
-  0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45,
-  0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28,
-  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c,
-  0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d,
-  0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72,
-  0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c,
-  0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e,
-  0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e,
-  0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
-  0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74,
-  0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74,
-  0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61,
-  0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77,
-  0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74,
-  0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20,
-  0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68,
-  0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e,
-  0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
-  0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
-  0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76,
-  0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74,
-  0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, 0x6e,
-  0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
-  0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74,
-  0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64,
+  0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f,
+  0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f,
+  0x20, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20,
+  0x70, 0x61, 0x72, 0x74, 0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x72, 0x65,
+  0x61, 0x64, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20,
+  0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65,
+  0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75,
+  0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c,
+  0x65, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72,
+  0x65, 0x73, 0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69,
+  0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61,
+  0x64, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
+  0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f,
+  0x6e, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x2f, 0x2f, 0x20, 0x41, 0x64, 0x64, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x6c,
+  0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x64, 0x61, 0x74, 0x61,
+  0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x75, 0x72, 0x72,
+  0x65, 0x6e, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66,
+  0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d,
+  0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x2b, 0x20,
+  0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f,
+  0x64, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61,
+  0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x2f, 0x2f, 0x20, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x20, 0x69, 0x66,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x63, 0x68,
+  0x61, 0x72, 0x61, 0x63, 0x74, 0x65, 0x72, 0x20, 0x69, 0x73, 0x20, 0x61,
+  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65,
+  0x42, 0x72, 0x65, 0x61, 0x6b, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74,
+  0x2e, 0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x5c,
+  0x6e, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x2f, 0x2f, 0x20, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x20, 0x74, 0x68, 0x65,
+  0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x6c,
+  0x69, 0x6e, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c,
+  0x65, 0x74, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x74,
+  0x65, 0x78, 0x74, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x5c,
+  0x6e, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x2f, 0x2f, 0x20, 0x49, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65,
+  0x78, 0x74, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x6e, 0x27, 0x74, 0x20, 0x65,
+  0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x61, 0x20, 0x6c, 0x69,
+  0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2c, 0x20, 0x74, 0x68,
+  0x65, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20,
+  0x6c, 0x69, 0x6e, 0x65, 0x20, 0x69, 0x73, 0x20, 0x69, 0x6e, 0x63, 0x6f,
+  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x2f, 0x2f, 0x20, 0x53, 0x74, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x74,
+  0x20, 0x69, 0x6e, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72,
+  0x20, 0x74, 0x6f, 0x20, 0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64,
+  0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6e, 0x65, 0x78, 0x74,
+  0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61,
+  0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
+  0x28, 0x21, 0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69,
+  0x6e, 0x65, 0x42, 0x72, 0x65, 0x61, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f,
+  0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x2e,
+  0x70, 0x6f, 0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76,
+  0x65, 0x72, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20,
+  0x52, 0x65, 0x73, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76,
+  0x65, 0x72, 0x20, 0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, 0x76,
+  0x65, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65,
+  0x61, 0x6b, 0x20, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e,
+  0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x50, 0x61, 0x72, 0x73,
+  0x65, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76,
+  0x65, 0x6e, 0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64,
+  0x20, 0x74, 0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73,
+  0x75, 0x6c, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20,
+  0x2f, 0x5e, 0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e,
+  0x2a, 0x29, 0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x69, 0x6e,
+  0x65, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63,
+  0x68, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x2e, 0x65, 0x78,
+  0x65, 0x63, 0x28, 0x6c, 0x69, 0x6e, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x61,
+  0x74, 0x63, 0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b,
+  0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20,
+  0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69,
+  0x6e, 0x63, 0x65, 0x20, 0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20,
+  0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x2e, 0x63, 0x70, 0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73,
+  0x20, 0x6a, 0x75, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e,
+  0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75,
+  0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72,
+  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d,
+  0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28,
+  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29,
+  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d,
+  0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61,
+  0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
+  0x2f, 0x20, 0x79, 0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c,
+  0x64, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b, 0x0a, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
+  0x2f, 0x20, 0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20,
+  0x61, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e,
+  0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72,
+  0x2c, 0x20, 0x77, 0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72,
+  0x65, 0x61, 0x6b, 0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
+  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61,
+  0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
+  0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61,
+  0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f,
+  0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
+  0x73, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
  0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69,
-  0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20,
-  0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69,
-  0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
-  0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63,
-  0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43,
-  0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22,
-  0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20,
-  0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e,
-  0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e,
-  0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
-  0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76,
-  0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74,
-  0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, 0x6e,
-  0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
-  0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
-  0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x28,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
-  0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b,
-  0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20,
-  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20,
-  0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65,
-  0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d,
-  0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e,
-  0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e,
-  0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x73,
-  0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, 0x0a,
-  0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a,
-  0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70,
-  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28,
-  0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e,
-  0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
-  0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a,
-  0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f,
-  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, 0x2f,
-  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d,
-  0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d,
-  0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64,
-  0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74,
-  0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f,
-  0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d,
-  0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
+  0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20,
+  0x3d, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62,
+  0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20,
+  0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61,
+  0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72,
+  0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65,
+  0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20,
+  0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20,
+  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72,
+  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28,
+  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65,
+  0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+  0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c,
+  0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75,
+  0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20,
+  0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20,
+  0x79, 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x63,
+  0x72, 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f,
+  0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f,
+  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f,
+  0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76,
+  0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20,
+  0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c,
+  0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f,
+  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
+  0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20,
+  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45,
+  0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72,
+  0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20,
+  0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
+  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63,
+  0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28,
+  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
+  0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f,
+  0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78,
+  0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c,
+  0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
+  0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
  0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20,
  0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d,
  0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x50,
-  0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63,
-  0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, 0x72,
-  0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
-  0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b,
-  0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72,
-  0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
-  0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e,
-  0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
-  0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20,
-  0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x65,
-  0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x2f,
-  0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, 0x65,
-  0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, 0x65,
-  0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74,
-  0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x70,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72,
-  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62,
-  0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f,
-  0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e,
-  0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61,
-  0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, 0x74,
-  0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x69,
-  0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69,
-  0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20,
-  0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20,
-  0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20,
-  0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73,
-  0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20,
-  0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e,
-  0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
-  0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
+  0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20,
+  0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28,
+  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20,
+  0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
+  0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f,
+  0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e,
+  0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20,
+  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
+  0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f,
+  0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b,
+  0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+  0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61,
+  0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e,
+  0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70,
+  0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65,
+  0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e,
+  0x74, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c,
+  0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63,
+  0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29,
+  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75,
+  0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65,
+  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
+  0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
+  0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45,
+  0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73,
+  0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65,
+  0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74,
+  0x74, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65,
+  0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e,
+  0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74,
+  0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73,
+  0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
+  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74,
+  0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
+  0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74,
+  0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20,
+  0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28,
+  0x22, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b,
+  0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75,
+  0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69,
+  0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
+  0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45,
+  0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73,
+  0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f,
+  0x6e, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69,
+  0x6c, 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+  0x20, 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29,
+  0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
+  0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
+  0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c,
+  0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75,
+  0x72, 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65,
+  0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
+  0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f,
+  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74,
+  0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20,
+  0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20,
+  0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f,
+  0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a,
+  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c,
+  0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28,
+  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e,
+  0x28, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d,
+  0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72,
+  0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
+  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f,
+  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a,
+  0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
+  0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f,
+  0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
+  0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a,
+  0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f,
+  0x6d, 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d,
+  0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d,
+  0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20,
+  0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20,
+  0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e,
+  0x63, 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20,
+  0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e,
+  0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74,
+  0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e,
+  0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70,
+  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d,
+  0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
+  0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75,
+  0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74,
+  0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c,
+  0x76, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68,
+  0x20, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28,
+  0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a,
+  0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72,
+  0x65, 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a,
+  0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
+  0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28,
+  0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74,
+  0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c,
+  0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20,
+  0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72,
+  0x61, 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20,
+  0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f,
+  0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62,
+  0x61, 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a,
+  0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65,
+  0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20,
+  0x69, 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68,
+  0x65, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68,
+  0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c,
+  0x20, 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74,
+  0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20,
+  0x73, 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72,
+  0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d,
+  0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65,
+  0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74,
+  0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
+  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x61,
+  0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22,
+  0x2f, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x22,
+  0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20,
+  0x72, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
-  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x77,
-  0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f,
-  0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x22, 0x29,
-  0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72,
-  0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67,
-  0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
-  0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
+  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
 };
-unsigned int completion_js_len = 4462;
+unsigned int completion_js_len = 5099;
--- a/examples/server/deps.sh
+++ b/examples/server/deps.sh
@@ -11,8 +11,10 @@ echo >> $PUBLIC/index.js # add newline

 FILES=$(ls $PUBLIC)

+cd $PUBLIC
 for FILE in $FILES; do
-  func=$(echo $FILE | tr '.' '_')
-  echo "generate $FILE.hpp ($func)"
-  xxd -n $func -i $PUBLIC/$FILE > $DIR/$FILE.hpp
+  echo "generate $FILE.hpp"
+
+  # use simple flag for old version of xxd
+  xxd -i $FILE > $DIR/$FILE.hpp
 done
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/index.js.hpp
+++ b/examples/server/index.js.hpp
--- a/examples/server/json-schema-to-grammar.mjs.hpp
+++ b/examples/server/json-schema-to-grammar.mjs.hpp
@@ -0,0 +1,311 @@
+unsigned char json_schema_to_grammar_mjs[] = {
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f,
+  0x52, 0x55, 0x4c, 0x45, 0x20, 0x3d, 0x20, 0x27, 0x22, 0x20, 0x22, 0x3f,
+  0x27, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x52,
+  0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45,
+  0x53, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x62, 0x6f, 0x6f, 0x6c,
+  0x65, 0x61, 0x6e, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x74, 0x72, 0x75, 0x65,
+  0x22, 0x20, 0x7c, 0x20, 0x22, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x22, 0x29,
+  0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x6e,
+  0x75, 0x6d, 0x62, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d, 0x22,
+  0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20, 0x5b,
+  0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a, 0x29,
+  0x29, 0x20, 0x28, 0x22, 0x2e, 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d,
+  0x2b, 0x29, 0x3f, 0x20, 0x28, 0x5b, 0x65, 0x45, 0x5d, 0x20, 0x5b, 0x2d,
+  0x2b, 0x5d, 0x3f, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2b, 0x29, 0x3f,
+  0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x69,
+  0x6e, 0x74, 0x65, 0x67, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d,
+  0x22, 0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20,
+  0x5b, 0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a,
+  0x29, 0x29, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20,
+  0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x60, 0x20, 0x22,
+  0x5c, 0x5c, 0x22, 0x22, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x5b, 0x5e, 0x22, 0x5c, 0x5c, 0x5c, 0x5c, 0x5d, 0x20,
+  0x7c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x22, 0x5c,
+  0x5c, 0x5c, 0x5c, 0x22, 0x20, 0x28, 0x5b, 0x22, 0x5c, 0x5c, 0x5c, 0x5c,
+  0x2f, 0x62, 0x66, 0x6e, 0x72, 0x74, 0x5d, 0x20, 0x7c, 0x20, 0x22, 0x75,
+  0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2a, 0x20,
+  0x22, 0x5c, 0x5c, 0x22, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60,
+  0x2c, 0x0a, 0x20, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3a, 0x20, 0x27, 0x22,
+  0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
+  0x2c, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44, 0x5f, 0x52, 0x55, 0x4c, 0x45,
+  0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f, 0x52, 0x45, 0x20, 0x3d, 0x20,
+  0x2f, 0x5b, 0x5e, 0x5c, 0x64, 0x41, 0x2d, 0x5a, 0x61, 0x2d, 0x7a, 0x2d,
+  0x5d, 0x2b, 0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45,
+  0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f, 0x52,
+  0x45, 0x20, 0x3d, 0x20, 0x2f, 0x5b, 0x5c, 0x6e, 0x5c, 0x72, 0x22, 0x5d,
+  0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x47, 0x52,
+  0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41,
+  0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x53, 0x20, 0x3d, 0x20,
+  0x7b, 0x27, 0x5c, 0x72, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x72, 0x27,
+  0x2c, 0x20, 0x27, 0x5c, 0x6e, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x6e,
+  0x27, 0x2c, 0x20, 0x27, 0x22, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x22,
+  0x27, 0x7d, 0x3b, 0x0a, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20,
+  0x63, 0x6c, 0x61, 0x73, 0x73, 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
+  0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f,
+  0x72, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x3d,
+  0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x7c,
+  0x7c, 0x20, 0x7b, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68,
+  0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x20, 0x3d, 0x20,
+  0x6e, 0x65, 0x77, 0x20, 0x4d, 0x61, 0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c,
+  0x65, 0x73, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x27, 0x73, 0x70, 0x61, 0x63,
+  0x65, 0x27, 0x2c, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f, 0x52, 0x55,
+  0x4c, 0x45, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
+  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
+  0x61, 0x6c, 0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x4a, 0x53,
+  0x4f, 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79,
+  0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x2e, 0x72, 0x65,
+  0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54,
+  0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f,
+  0x52, 0x45, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x20,
+  0x3d, 0x3e, 0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c,
+  0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50,
+  0x45, 0x53, 0x5b, 0x6d, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x60, 0x22, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x7d,
+  0x22, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f,
+  0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x6e, 0x61, 0x6d, 0x65,
+  0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d,
+  0x65, 0x20, 0x3d, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2e, 0x72, 0x65, 0x70,
+  0x6c, 0x61, 0x63, 0x65, 0x28, 0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44,
+  0x5f, 0x52, 0x55, 0x4c, 0x45, 0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f,
+  0x52, 0x45, 0x2c, 0x20, 0x27, 0x2d, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20,
+  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f,
+  0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x61, 0x73, 0x28, 0x65, 0x73,
+  0x63, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73,
+  0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x67, 0x65, 0x74, 0x28,
+  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x3d, 0x3d, 0x3d,
+  0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
+  0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73,
+  0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x61, 0x73, 0x28,
+  0x60, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
+  0x7b, 0x69, 0x7d, 0x60, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20, 0x60, 0x24, 0x7b,
+  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x69, 0x7d,
+  0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65,
+  0x73, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x6b, 0x65, 0x79, 0x2c, 0x20, 0x72,
+  0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+  0x74, 0x75, 0x72, 0x6e, 0x20, 0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20,
+  0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73,
+  0x63, 0x68, 0x65, 0x6d, 0x61, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20,
+  0x3d, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x74, 0x79, 0x70,
+  0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20,
+  0x6e, 0x61, 0x6d, 0x65, 0x20, 0x7c, 0x7c, 0x20, 0x27, 0x72, 0x6f, 0x6f,
+  0x74, 0x27, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
+  0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x6f, 0x6e, 0x65, 0x4f,
+  0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e,
+  0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c,
+  0x65, 0x20, 0x3d, 0x20, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e,
+  0x6f, 0x6e, 0x65, 0x4f, 0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68,
+  0x65, 0x6d, 0x61, 0x2e, 0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x2e, 0x6d,
+  0x61, 0x70, 0x28, 0x28, 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69,
+  0x73, 0x69, 0x74, 0x28, 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
+  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20,
+  0x3a, 0x20, 0x22, 0x22, 0x7d, 0x24, 0x7b, 0x69, 0x7d, 0x60, 0x29, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e,
+  0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74,
+  0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65,
+  0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x72,
+  0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20,
+  0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c,
+  0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
+  0x61, 0x6c, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x65,
+  0x6e, 0x75, 0x6d, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d,
+  0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x65, 0x6e, 0x75, 0x6d,
+  0x2e, 0x6d, 0x61, 0x70, 0x28, 0x76, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x68,
+  0x69, 0x73, 0x2e, 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69,
+  0x74, 0x65, 0x72, 0x61, 0x6c, 0x28, 0x76, 0x29, 0x29, 0x2e, 0x6a, 0x6f,
+  0x69, 0x6e, 0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c,
+  0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20,
+  0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x63,
+  0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d,
+  0x20, 0x27, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x27, 0x20, 0x26, 0x26,
+  0x20, 0x27, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73,
+  0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
+  0x54, 0x4f, 0x44, 0x4f, 0x3a, 0x20, 0x60, 0x72, 0x65, 0x71, 0x75, 0x69,
+  0x72, 0x65, 0x64, 0x60, 0x20, 0x6b, 0x65, 0x79, 0x77, 0x6f, 0x72, 0x64,
+  0x20, 0x28, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x79, 0x74, 0x68, 0x6f,
+  0x6e, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72,
+  0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f,
+  0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70,
+  0x72, 0x6f, 0x70, 0x50, 0x61, 0x69, 0x72, 0x73, 0x20, 0x3d, 0x20, 0x4f,
+  0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x65, 0x6e, 0x74, 0x72, 0x69, 0x65,
+  0x73, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x70, 0x72, 0x6f,
+  0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x29, 0x2e, 0x73, 0x6f, 0x72,
+  0x74, 0x28, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x3d, 0x3e, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
+  0x20, 0x73, 0x6f, 0x72, 0x74, 0x20, 0x62, 0x79, 0x20, 0x70, 0x6f, 0x73,
+  0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f,
+  0x70, 0x5f, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x28, 0x69, 0x66, 0x20,
+  0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x65, 0x64, 0x29, 0x20, 0x74,
+  0x68, 0x65, 0x6e, 0x20, 0x62, 0x79, 0x20, 0x6b, 0x65, 0x79, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x3d, 0x20, 0x74, 0x79,
+  0x70, 0x65, 0x6f, 0x66, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64,
+  0x65, 0x72, 0x5b, 0x61, 0x5b, 0x30, 0x5d, 0x5d, 0x20, 0x3d, 0x3d, 0x3d,
+  0x20, 0x27, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x27, 0x20, 0x3f, 0x20,
+  0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x61, 0x5b,
+  0x30, 0x5d, 0x5d, 0x20, 0x3a, 0x20, 0x49, 0x6e, 0x66, 0x69, 0x6e, 0x69,
+  0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x42,
+  0x20, 0x3d, 0x20, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x70, 0x72,
+  0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x62, 0x5b, 0x30, 0x5d,
+  0x5d, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x6e, 0x75, 0x6d, 0x62, 0x65,
+  0x72, 0x27, 0x20, 0x3f, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64,
+  0x65, 0x72, 0x5b, 0x62, 0x5b, 0x30, 0x5d, 0x5d, 0x20, 0x3a, 0x20, 0x49,
+  0x6e, 0x66, 0x69, 0x6e, 0x69, 0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x6f, 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x2d, 0x20, 0x6f, 0x72, 0x64,
+  0x65, 0x72, 0x42, 0x20, 0x7c, 0x7c, 0x20, 0x61, 0x5b, 0x30, 0x5d, 0x2e,
+  0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x65, 0x43, 0x6f, 0x6d, 0x70, 0x61, 0x72,
+  0x65, 0x28, 0x62, 0x5b, 0x30, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d,
+  0x20, 0x27, 0x22, 0x7b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
+  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x6f, 0x70,
+  0x50, 0x61, 0x69, 0x72, 0x73, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63,
+  0x68, 0x28, 0x28, 0x5b, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65,
+  0x2c, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
+  0x5d, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x70, 0x72, 0x6f, 0x70, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d,
+  0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69, 0x73,
+  0x69, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
+  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20,
+  0x3a, 0x20, 0x22, 0x22, 0x7d, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x4e,
+  0x61, 0x6d, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3e, 0x20,
+  0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x27,
+  0x20, 0x22, 0x2c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20,
+  0x2b, 0x3d, 0x20, 0x60, 0x20, 0x24, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
+  0x61, 0x6c, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65, 0x29,
+  0x7d, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x22, 0x3a, 0x22, 0x20,
+  0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70,
+  0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x60, 0x3b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20,
+  0x27, 0x20, 0x22, 0x7d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
+  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74,
+  0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64,
+  0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61,
+  0x6d, 0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66,
+  0x20, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65,
+  0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x61, 0x72, 0x72, 0x61, 0x79, 0x27,
+  0x20, 0x26, 0x26, 0x20, 0x27, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x27, 0x20,
+  0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29, 0x20, 0x7b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x4f,
+  0x44, 0x4f, 0x20, 0x60, 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, 0x49, 0x74,
+  0x65, 0x6d, 0x73, 0x60, 0x20, 0x6b, 0x65, 0x79, 0x77, 0x6f, 0x72, 0x64,
+  0x20, 0x28, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x79, 0x74, 0x68, 0x6f,
+  0x6e, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75,
+  0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69,
+  0x73, 0x2e, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x2e, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x2c, 0x20, 0x60, 0x24,
+  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65,
+  0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20, 0x3a, 0x20, 0x22, 0x22, 0x7d,
+  0x69, 0x74, 0x65, 0x6d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65,
+  0x20, 0x3d, 0x20, 0x60, 0x22, 0x5b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63,
+  0x65, 0x20, 0x28, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75, 0x6c,
+  0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x28, 0x22, 0x2c, 0x22, 0x20,
+  0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d,
+  0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x29, 0x2a, 0x29,
+  0x3f, 0x20, 0x22, 0x5d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60,
+  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
+  0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64,
+  0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d,
+  0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x50, 0x52,
+  0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45,
+  0x53, 0x5b, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65,
+  0x5d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45,
+  0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x55, 0x6e, 0x72, 0x65, 0x63, 0x6f,
+  0x67, 0x6e, 0x69, 0x7a, 0x65, 0x64, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x3a, 0x20, 0x24, 0x7b, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74,
+  0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x29, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+  0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61,
+  0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65,
+  0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20,
+  0x3f, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20, 0x3a, 0x20, 0x73,
+  0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x2c, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x50, 0x52, 0x49, 0x4d, 0x49,
+  0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x53, 0x5b, 0x73,
+  0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x5d, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72,
+  0x6d, 0x61, 0x74, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x28, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x67,
+  0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x3d, 0x20, 0x27, 0x27, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72,
+  0x75, 0x6c, 0x65, 0x73, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68,
+  0x28, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65,
+  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x2b, 0x3d, 0x20,
+  0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x3a, 0x3a, 0x3d,
+  0x20, 0x24, 0x7b, 0x72, 0x75, 0x6c, 0x65, 0x7d, 0x5c, 0x6e, 0x60, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x72, 0x61, 0x6d,
+  0x6d, 0x61, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a
+};
+unsigned int json_schema_to_grammar_mjs_len = 3695;
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@@ -43,6 +43,7 @@ export async function* llama(prompt, params = {}, config = {}) {
  const decoder = new TextDecoder();

  let content = "";
+  let leftover = ""; // Buffer for partially read lines

  try {
    let cont = true;
@@ -53,29 +54,47 @@ export async function* llama(prompt, params = {}, config = {}) {
        break;
      }

-      // sse answers in the form multiple lines of: value\n with data always present as a key. in our case we
-      // mainly care about the data: key here, which we expect as json
-      const text = decoder.decode(result.value);
+      // Add any leftover data to the current chunk of data
+      const text = leftover + decoder.decode(result.value);

-      // parse all sse events and add them to result
-      const regex = /^(\S+):\s(.*)$/gm;
-      for (const match of text.matchAll(regex)) {
-        result[match[1]] = match[2]
+      // Check if the last character is a line break
+      const endsWithLineBreak = text.endsWith('\n');
+
+      // Split the text into lines
+      let lines = text.split('\n');
+
+      // If the text doesn't end with a line break, then the last line is incomplete
+      // Store it in leftover to be added to the next chunk of data
+      if (!endsWithLineBreak) {
+        leftover = lines.pop();
+      } else {
+        leftover = ""; // Reset leftover if we have a line break at the end
      }

-      // since we know this is llama.cpp, let's just decode the json in data
-      result.data = JSON.parse(result.data);
-      content += result.data.content;
+      // Parse all sse events and add them to result
+      const regex = /^(\S+):\s(.*)$/gm;
+      for (const line of lines) {
+        const match = regex.exec(line);
+        if (match) {
+          result[match[1]] = match[2]
+          // since we know this is llama.cpp, let's just decode the json in data
+          if (result.data) {
+            result.data = JSON.parse(result.data);
+            content += result.data.content;

-      // yield
-      yield result;
+            // yield
+            yield result;

-      // if we got a stop token from server, we will break here
-      if (result.data.stop) {
-        if (result.data.generation_settings) {
-          generation_settings = result.data.generation_settings;
+            // if we got a stop token from server, we will break here
+            if (result.data.stop) {
+              if (result.data.generation_settings) {
+                generation_settings = result.data.generation_settings;
+              }
+              cont = false;
+              break;
+            }
+          }
        }
-        break;
      }
    }
  } catch (e) {
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -3,12 +3,11 @@
 <head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
+  <meta name="color-scheme" content="light dark">
  <title>llama.cpp - chat</title>

  <style>
    body {
-      background-color: #fff;
-      color: #000;
      font-family: system-ui;
      font-size: 90%;
    }
@@ -103,6 +102,17 @@
      padding: 0.5em;
    }

+    .prob-set {
+      padding: 0.3em;
+      border-bottom: 1px solid #ccc;
+    }
+
+    .popover-content {
+      position: absolute;
+      background-color: white;
+      padding: 0.2em;
+      box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
+    }

    textarea {
      padding: 5px;
@@ -134,22 +144,29 @@
      font-size: 80%;
      color: #888;
    }
+
+    @media (prefers-color-scheme: dark) {
+      .popover-content {
+        background-color: black;
+      }
+    }
  </style>

  <script type="module">
    import {
-      html, h, signal, effect, computed, render, useSignal, useEffect, useRef
+      html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
    } from '/index.js';

    import { llama } from '/completion.js';
+    import { SchemaConverter } from '/json-schema-to-grammar.mjs';

    const session = signal({
-      prompt: "This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.",
+      prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
      template: "{{prompt}}\n\n{{history}}\n{{char}}:",
      historyTemplate: "{{name}}: {{message}}",
      transcript: [],
      type: "chat",
-      char: "llama",
+      char: "Llama",
      user: "User",
    })

@@ -167,8 +184,140 @@
      mirostat: 0, // 0/1/2
      mirostat_tau: 5, // target entropy
      mirostat_eta: 0.1, // learning rate
+      grammar: '',
+      n_probs: 0, // no completion_probabilities
    })

+    /* START: Support for storing prompt templates and parameters in borwser LocalStorage */
+
+    const local_storage_storageKey = "llamacpp_server_local_storage";
+
+    function local_storage_setDataFromObject(tag, content) {
+      localStorage.setItem(local_storage_storageKey + '/' + tag, JSON.stringify(content));
+    }
+
+    function local_storage_setDataFromRawText(tag, content) {
+      localStorage.setItem(local_storage_storageKey + '/' + tag, content);
+    }
+
+    function local_storage_getDataAsObject(tag) {
+      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
+      if (!item) {
+        return null;
+      } else {
+        return JSON.parse(item);
+      }
+    }
+
+    function local_storage_getDataAsRawText(tag) {
+      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
+      if (!item) {
+        return null;
+      } else {
+        return item;
+      }
+    }
+
+    // create a container for user templates and settings
+
+    const savedUserTemplates = signal({})
+    const selectedUserTemplate = signal({ name: '', template: { session: {}, params: {} } })
+
+    // let's import locally saved templates and settings if there are any
+    // user templates and settings are stored in one object
+    // in form of { "templatename": "templatedata" } and { "settingstemplatename":"settingsdata" }
+
+    console.log('Importing saved templates')
+
+    let importedTemplates = local_storage_getDataAsObject('user_templates')
+
+    if (importedTemplates) {
+      // saved templates were successfuly imported.
+
+      console.log('Processing saved templates and updating default template')
+
+      //console.log(importedTemplates);
+      savedUserTemplates.value = importedTemplates;
+
+      //override default template
+      savedUserTemplates.value.default = { session: session.value, params: params.value }
+      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
+    } else {
+      // no saved templates detected.
+
+      console.log('Initializing LocalStorage and saving default template')
+
+      savedUserTemplates.value = { "default": { session: session.value, params: params.value } }
+      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
+    }
+
+    function userTemplateResetToDefault() {
+      console.log('Reseting themplate to default')
+      selectedUserTemplate.value.name = 'default';
+      selectedUserTemplate.value.data = savedUserTemplates.value['default'];
+    }
+
+    function userTemplateApply(t) {
+      session.value = t.data.session;
+      params.value = t.data.params;
+    }
+
+    function userTemplateResetToDefaultAndApply() {
+      userTemplateResetToDefault()
+      userTemplateApply(selectedUserTemplate.value)
+    }
+
+    function userTemplateLoadAndApplyAutosaved() {
+      // get autosaved last used template
+      let lastUsedTemplate = local_storage_getDataAsObject('user_templates_last')
+
+      if (lastUsedTemplate) {
+
+        console.log('Autosaved template found, restoring')
+
+        selectedUserTemplate.value = lastUsedTemplate
+      }
+      else {
+
+        console.log('No autosaved template found, using default template')
+        // no autosaved last used template was found, so load from default.
+
+        userTemplateResetToDefault()
+      }
+
+      console.log('Applying template')
+      // and update internal data from templates
+
+      userTemplateApply(selectedUserTemplate.value)
+    }
+
+    //console.log(savedUserTemplates.value)
+    //console.log(selectedUserTemplate.value)
+
+    function userTemplateAutosave() {
+      console.log('Template Autosave...')
+      if (selectedUserTemplate.value.name == 'default') {
+        // we don't want to save over default template, so let's create a new one
+        let newTemplateName = 'UserTemplate-' + Date.now().toString()
+        let newTemplate = { 'name': newTemplateName, 'data': { 'session': session.value, 'params': params.value } }
+
+        console.log('Saving as ' + newTemplateName)
+
+        // save in the autosave slot
+        local_storage_setDataFromObject('user_templates_last', newTemplate)
+
+        // and load it back and apply
+        userTemplateLoadAndApplyAutosaved()
+      } else {
+        local_storage_setDataFromObject('user_templates_last', { 'name': selectedUserTemplate.value.name, 'data': { 'session': session.value, 'params': params.value } })
+      }
+    }
+
+    console.log('Checking for autosaved last used template')
+    userTemplateLoadAndApplyAutosaved()
+
+    /* END: Support for storing prompt templates and parameters in browsers LocalStorage */
+
    const llamaStats = signal(null)
    const controller = signal(null)

@@ -203,10 +352,21 @@

      const prompt = template(session.value.template, {
        message: msg,
-        history: session.value.transcript.flatMap(([name, message]) => template(session.value.historyTemplate, {name, message})).join("\n"),
+        history: session.value.transcript.flatMap(
+          ([name, data]) =>
+            template(
+              session.value.historyTemplate,
+              {
+                name,
+                message: Array.isArray(data) ?
+                  data.map(msg => msg.content).join('').replace(/^\s/, '') :
+                  data,
+              }
+            )
+        ).join("\n"),
      });

-      let currentMessage = '';
+      const currentMessages = [];
      const history = session.value.transcript

      const llamaParams = {
@@ -216,15 +376,19 @@

      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
        const data = chunk.data;
-        currentMessage += data.content;
-
-        // remove leading whitespace
-        currentMessage = currentMessage.replace(/^\s+/, "")
-
-        transcriptUpdate([...history, ["{{char}}", currentMessage]])

        if (data.stop) {
-          console.log("Completion finished: '", currentMessage, "', summary: ", data);
+          while (
+            currentMessages.length > 0 &&
+            currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
+          ) {
+            currentMessages.pop();
+          }
+          transcriptUpdate([...history, ["{{char}}", currentMessages]])
+          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
+        } else {
+          currentMessages.push(data);
+          transcriptUpdate([...history, ["{{char}}", currentMessages]])
        }

        if (data.timings) {
@@ -283,13 +447,24 @@

      useEffect(() => {
        // scroll to bottom (if needed)
-        if (container.current && container.current.scrollHeight <= container.current.scrollTop + container.current.offsetHeight + 300) {
-          container.current.scrollTo(0, container.current.scrollHeight)
+        const parent = container.current.parentElement;
+        if (parent && parent.scrollHeight <= parent.scrollTop + parent.offsetHeight + 300) {
+          parent.scrollTo(0, parent.scrollHeight)
        }
      }, [messages])

-      const chatLine = ([user, msg]) => {
-        return html`<p key=${msg}><strong>${template(user)}:</strong> <${Markdownish} text=${template(msg)} /></p>`
+      const chatLine = ([user, data], index) => {
+        let message
+        const isArrayMessage = Array.isArray(data)
+        if (params.value.n_probs > 0 && isArrayMessage) {
+          message = html`<${Probabilities} data=${data} />`
+        } else {
+          const text = isArrayMessage ?
+            data.map(msg => msg.content).join('').replace(/^\s+/, '') :
+            data;
+          message = html`<${Markdownish} text=${template(text)} />`
+        }
+        return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
      };

      return html`
@@ -304,6 +479,26 @@
      const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
      const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }

+      const grammarJsonSchemaPropOrder = signal('')
+      const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
+      const convertJSONSchemaGrammar = () => {
+        try {
+          const schema = JSON.parse(params.value.grammar)
+          const converter = new SchemaConverter(
+            grammarJsonSchemaPropOrder.value
+              .split(',')
+              .reduce((acc, cur, i) => ({...acc, [cur.trim()]: i}), {})
+          )
+          converter.visit(schema, '')
+          params.value = {
+            ...params.value,
+            grammar: converter.formatGrammar(),
+          }
+        } catch (e) {
+          alert(`Convert failed: ${e.message}`)
+        }
+      }
+
      const FloatField = ({label, max, min, name, step, value}) => {
        return html`
          <div>
@@ -324,8 +519,34 @@
        `
      };

+      const userTemplateReset = (e) => {
+        e.preventDefault();
+        userTemplateResetToDefaultAndApply()
+      }
+
+      const UserTemplateResetButton = () => {
+        if (selectedUserTemplate.value.name == 'default') {
+          return html`
+            <button disabled>Using default template</button>
+          `
+        }
+
+        return html`
+          <button onclick=${userTemplateReset}>Reset all to default</button>
+        `
+      };
+
+      useEffect(() => {
+        // autosave template on every change
+        userTemplateAutosave()
+      }, [session.value, params.value])
+
      return html`
        <form>
+          <fieldset>
+            <${UserTemplateResetButton}/>
+          </fieldset>
+
          <fieldset>
            <div>
              <label for="prompt">Prompt</label>
@@ -355,6 +576,13 @@
              <label for="template">Chat history template</label>
              <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
            </div>
+
+            <div>
+              <label for="template">Grammar</label>
+              <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+              <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
+              <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
+            </div>
          </fieldset>

          <fieldset class="two">
@@ -383,10 +611,71 @@
              ${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
              ${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
            </fieldset>
+            <fieldset>
+              ${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})}
+            </fieldset>
          </details>
        </form>
      `
    }
+
+    const probColor = (p) => {
+      const r = Math.floor(192 * (1 - p));
+      const g = Math.floor(192 * p);
+      return `rgba(${r},${g},0,0.3)`;
+    }
+
+    const Probabilities = (params) => {
+      return params.data.map(msg => {
+        const { completion_probabilities } = msg;
+        if (
+          !completion_probabilities ||
+          completion_probabilities.length === 0
+        ) return msg.content
+
+        if (completion_probabilities.length > 1) {
+          // Not for byte pair
+          if (completion_probabilities[0].content.startsWith('byte: \\')) return msg.content
+
+          const splitData = completion_probabilities.map(prob => ({
+            content: prob.content,
+            completion_probabilities: [prob]
+          }))
+          return html`<${Probabilities} data=${splitData} />`
+        }
+
+        const { probs, content } = completion_probabilities[0]
+        const found = probs.find(p => p.tok_str === msg.content)
+        const pColor = found ? probColor(found.prob) : 'transparent'
+
+        const popoverChildren = html`
+          <div class="prob-set">
+            ${probs.map((p, index) => {
+              return html`
+                <div
+                  key=${index}
+                  title=${`prob: ${p.prob}`}
+                  style=${{
+                    padding: '0.3em',
+                    backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
+                  }}
+                >
+                  <span>${p.tok_str}: </span>
+                  <span>${Math.floor(p.prob * 100)}%</span>
+                </div>
+              `
+            })}
+          </div>
+        `
+
+        return html`
+          <${Popover} style=${{ backgroundColor: pColor }} popoverChildren=${popoverChildren}>
+            ${msg.content.match(/\n/gim) ? html`<br />` : msg.content}
+          </>
+        `
+      });
+    }
+
    // poor mans markdown replacement
    const Markdownish = (params) => {
      const md = params.text
@@ -415,10 +704,121 @@
      `
    }

+    // simple popover impl
+    const Popover = (props) => {
+      const isOpen = useSignal(false);
+      const position = useSignal({ top: '0px', left: '0px' });
+      const buttonRef = useRef(null);
+      const popoverRef = useRef(null);
+
+      const togglePopover = () => {
+        if (buttonRef.current) {
+          const rect = buttonRef.current.getBoundingClientRect();
+          position.value = {
+            top: `${rect.bottom + window.scrollY}px`,
+            left: `${rect.left + window.scrollX}px`,
+          };
+        }
+        isOpen.value = !isOpen.value;
+      };
+
+      const handleClickOutside = (event) => {
+        if (popoverRef.current && !popoverRef.current.contains(event.target) && !buttonRef.current.contains(event.target)) {
+          isOpen.value = false;
+        }
+      };
+
+      useEffect(() => {
+        document.addEventListener('mousedown', handleClickOutside);
+        return () => {
+          document.removeEventListener('mousedown', handleClickOutside);
+        };
+      }, []);
+
+      return html`
+        <span style=${props.style} ref=${buttonRef} onClick=${togglePopover}>${props.children}</span>
+        ${isOpen.value && html`
+          <${Portal} into="#portal">
+            <div
+              ref=${popoverRef}
+              class="popover-content"
+              style=${{
+                top: position.value.top,
+                left: position.value.left,
+              }}
+            >
+              ${props.popoverChildren}
+            </div>
+          </${Portal}>
+        `}
+      `;
+    };
+
+    // Source: preact-portal (https://github.com/developit/preact-portal/blob/master/src/preact-portal.js)
+    /** Redirect rendering of descendants into the given CSS selector */
+    class Portal extends Component {
+      componentDidUpdate(props) {
+        for (let i in props) {
+          if (props[i] !== this.props[i]) {
+            return setTimeout(this.renderLayer);
+          }
+        }
+      }
+
+      componentDidMount() {
+        this.isMounted = true;
+        this.renderLayer = this.renderLayer.bind(this);
+        this.renderLayer();
+      }
+
+      componentWillUnmount() {
+        this.renderLayer(false);
+        this.isMounted = false;
+        if (this.remote && this.remote.parentNode) this.remote.parentNode.removeChild(this.remote);
+      }
+
+      findNode(node) {
+        return typeof node === 'string' ? document.querySelector(node) : node;
+      }
+
+      renderLayer(show = true) {
+        if (!this.isMounted) return;
+
+        // clean up old node if moving bases:
+        if (this.props.into !== this.intoPointer) {
+          this.intoPointer = this.props.into;
+          if (this.into && this.remote) {
+            this.remote = render(html`<${PortalProxy} />`, this.into, this.remote);
+          }
+          this.into = this.findNode(this.props.into);
+        }
+
+        this.remote = render(html`
+          <${PortalProxy} context=${this.context}>
+            ${show && this.props.children || null}
+          </${PortalProxy}>
+        `, this.into, this.remote);
+      }
+
+      render() {
+        return null;
+      }
+    }
+    // high-order component that renders its first child if it exists.
+    // used as a conditional rendering proxy.
+    class PortalProxy extends Component {
+      getChildContext() {
+        return this.props.context;
+      }
+      render({ children }) {
+        return children || null;
+      }
+    }
+
    function App(props) {

      return html`
-        <div id="container">
+        <div>
          <header>
            <h1>llama.cpp</h1>
          </header>
@@ -439,11 +839,13 @@
      `;
    }

-    render(h(App), document.body);
+    render(h(App), document.querySelector('#container'));
  </script>
 </head>

 <body>
+  <div id="container"></div>
+  <div id="portal"></div>
 </body>

 </html>
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -0,0 +1,112 @@
+const SPACE_RULE = '" "?';
+
+const PRIMITIVE_RULES = {
+  boolean: '("true" | "false") space',
+  number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
+  integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
+  string: ` "\\"" (
+        [^"\\\\] |
+        "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+      )* "\\"" space`,
+  null: '"null" space',
+};
+
+const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
+const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
+const GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'};
+
+export class SchemaConverter {
+  constructor(propOrder) {
+    this._propOrder = propOrder || {};
+    this._rules = new Map();
+    this._rules.set('space', SPACE_RULE);
+  }
+
+  _formatLiteral(literal) {
+    const escaped = JSON.stringify(literal).replace(
+      GRAMMAR_LITERAL_ESCAPE_RE,
+      m => GRAMMAR_LITERAL_ESCAPES[m]
+    );
+    return `"${escaped}"`;
+  }
+
+  _addRule(name, rule) {
+    let escName = name.replace(INVALID_RULE_CHARS_RE, '-');
+    let key = escName;
+
+    if (this._rules.has(escName)) {
+      if (this._rules.get(escName) === rule) {
+        return key;
+      }
+
+      let i = 0;
+      while (this._rules.has(`${escName}${i}`)) {
+        i += 1;
+      }
+      key = `${escName}${i}`;
+    }
+
+    this._rules.set(key, rule);
+    return key;
+  }
+
+  visit(schema, name) {
+    const schemaType = schema.type;
+    const ruleName = name || 'root';
+
+    if (schema.oneOf || schema.anyOf) {
+      const rule = (schema.oneOf || schema.anyOf).map((altSchema, i) =>
+        this.visit(altSchema, `${name}${name ? "-" : ""}${i}`)
+      ).join(' | ');
+
+      return this._addRule(ruleName, rule);
+    } else if ('const' in schema) {
+      return this._addRule(ruleName, this._formatLiteral(schema.const));
+    } else if ('enum' in schema) {
+      const rule = schema.enum.map(v => this._formatLiteral(v)).join(' | ');
+      return this._addRule(ruleName, rule);
+    } else if (schemaType === 'object' && 'properties' in schema) {
+      // TODO: `required` keyword (from python implementation)
+      const propOrder = this._propOrder;
+      const propPairs = Object.entries(schema.properties).sort((a, b) => {
+        // sort by position in prop_order (if specified) then by key
+        const orderA = typeof propOrder[a[0]] === 'number' ? propOrder[a[0]] : Infinity;
+        const orderB = typeof propOrder[b[0]] === 'number' ? propOrder[b[0]] : Infinity;
+        return orderA - orderB || a[0].localeCompare(b[0]);
+      });
+
+      let rule = '"{" space';
+      propPairs.forEach(([propName, propSchema], i) => {
+        const propRuleName = this.visit(propSchema, `${name}${name ? "-" : ""}${propName}`);
+        if (i > 0) {
+          rule += ' "," space';
+        }
+        rule += ` ${this._formatLiteral(propName)} space ":" space ${propRuleName}`;
+      });
+      rule += ' "}" space';
+
+      return this._addRule(ruleName, rule);
+    } else if (schemaType === 'array' && 'items' in schema) {
+      // TODO `prefixItems` keyword (from python implementation)
+      const itemRuleName = this.visit(schema.items, `${name}${name ? "-" : ""}item`);
+      const rule = `"[" space (${itemRuleName} ("," space ${itemRuleName})*)? "]" space`;
+      return this._addRule(ruleName, rule);
+    } else {
+      if (!PRIMITIVE_RULES[schemaType]) {
+        throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
+      }
+      return this._addRule(
+        ruleName === 'root' ? 'root' : schemaType,
+        PRIMITIVE_RULES[schemaType]
+      );
+    }
+  }
+
+  formatGrammar() {
+    let grammar = '';
+    this._rules.forEach((rule, name) => {
+      grammar += `${name} ::= ${rule}\n`;
+    });
+    return grammar;
+  }
+}
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,6 +1,7 @@
 #include "common.h"
 #include "llama.h"
 #include "build-info.h"
+#include "grammar-parser.h"

 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -14,6 +15,7 @@
 #include "index.html.hpp"
 #include "index.js.hpp"
 #include "completion.js.hpp"
+#include "json-schema-to-grammar.mjs.hpp"

 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
@@ -92,7 +94,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
    std::string ret;
    for (; begin != end; ++begin)
    {
-        ret += llama_token_to_str(ctx, *begin);
+        ret += llama_token_to_piece(ctx, *begin);
    }
    return ret;
 }
@@ -121,9 +123,10 @@ static void server_log(const char *level, const char *function, int line,
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
-    // if first bit is 1, meaning it's a partial character
-    if (out.size() > 0 && (out[0] & 0x80) == 0x80)
+    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+    // if the size is 1 and first bit is 1, meaning it's a partial character
+    //   (size > 1 meaning it's already a known token)
+    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
    {
        std::stringstream ss;
        ss << std::hex << (out[0] & 0xff);
@@ -188,6 +191,7 @@ struct llama_server_context
    size_t n_past = 0;
    size_t n_remain = 0;

+    json prompt;
    std::vector<llama_token> embd;
    std::vector<llama_token> last_n_tokens;

@@ -195,6 +199,9 @@ struct llama_server_context
    llama_context *ctx = nullptr;
    gpt_params params;

+    grammar_parser::parse_state parsed_grammar;
+    llama_grammar *grammar = nullptr;
+
    bool truncated = false;
    bool stopped_eos = false;
    bool stopped_word = false;
@@ -226,6 +233,7 @@ struct llama_server_context
    void rewind()
    {
        params.antiprompt.clear();
+        params.grammar.clear();
        num_prompt_tokens = 0;
        num_tokens_predicted = 0;
        generated_text = "";
@@ -237,9 +245,13 @@ struct llama_server_context
        stopped_limit = false;
        stopping_word = "";
        multibyte_pending = 0;
-
        n_remain = 0;
        n_past = 0;
+
+        if (grammar != nullptr) {
+            llama_grammar_free(grammar);
+            grammar = nullptr;
+        }
    }

    bool loadModel(const gpt_params &params_)
@@ -257,10 +269,80 @@ struct llama_server_context
        return true;
    }

+    std::vector<llama_token> tokenize(json json_prompt, bool add_bos)
+    {
+        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+        // or the first element of the json_prompt array is a string.
+        std::vector<llama_token> prompt_tokens;
+
+        if (json_prompt.is_array())
+        {
+            bool first = true;
+            for (const auto& p : json_prompt)
+            {
+                if (p.is_string())
+                {
+                    auto s = p.template get<std::string>();
+                    std::vector<llama_token> p;
+                    if (first)
+                    {
+                        p = ::llama_tokenize(ctx, s, add_bos);
+                        first = false;
+                    }
+                    else
+                    {
+                        p = ::llama_tokenize(ctx, s, false);
+                    }
+                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+                }
+                else
+                {
+                    if (first)
+                    {
+                        first = false;
+                    }
+                    prompt_tokens.push_back(p.template get<llama_token>());
+                }
+            }
+        }
+        else
+        {
+            auto s = json_prompt.template get<std::string>();
+            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+        }
+
+        return prompt_tokens;
+    }
+
+    bool loadGrammar()
+    {
+        if (!params.grammar.empty()) {
+            parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+            // will be empty (default) if there are parse errors
+            if (parsed_grammar.rules.empty()) {
+                LOG_ERROR("grammar parse error", {{"grammar", params.grammar}});
+                return false;
+            }
+            grammar_parser::print_grammar(stderr, parsed_grammar);
+
+            {
+                auto it = params.logit_bias.find(llama_token_eos(ctx));
+                if (it != params.logit_bias.end() && it->second == -INFINITY) {
+                    LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
+                }
+            }
+
+            std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+            grammar = llama_grammar_init(
+                grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+        }
+        return true;
+    }
+
    void loadPrompt()
    {
-        params.prompt.insert(0, 1, ' '); // always add a first space
-        std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
+        auto prompt_tokens = tokenize(prompt, true);  // always add BOS
+
        num_prompt_tokens = prompt_tokens.size();

        if (params.n_keep < 0)
@@ -367,7 +449,7 @@ struct llama_server_context
        if (params.n_predict == 0)
        {
            has_next_token = false;
-            result.tok = llama_token_eos();
+            result.tok = llama_token_eos(ctx);
            return result;
        }

@@ -407,7 +489,7 @@ struct llama_server_context
            llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};

            // Apply penalties
-            float nl_logit = logits[llama_token_nl()];
+            float nl_logit = logits[llama_token_nl(ctx)];
            auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
            llama_sample_repetition_penalty(ctx, &candidates_p,
                                            last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -417,7 +499,11 @@ struct llama_server_context
                                                          last_n_repeat, alpha_frequency, alpha_presence);
            if (!penalize_nl)
            {
-                logits[llama_token_nl()] = nl_logit;
+                logits[llama_token_nl(ctx)] = nl_logit;
+            }
+
+            if (grammar != nullptr) {
+                llama_sample_grammar(ctx, &candidates_p, grammar);
            }

            if (temp <= 0)
@@ -457,10 +543,15 @@ struct llama_server_context
                }
            }

+            if (grammar != nullptr) {
+                llama_grammar_accept_token(ctx, grammar, result.tok);
+            }
+
            for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
            {
                result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
            }
+
            last_n_tokens.erase(last_n_tokens.begin());
            last_n_tokens.push_back(result.tok);
            num_tokens_predicted++;
@@ -471,9 +562,9 @@ struct llama_server_context
        // decrement remaining sampling budget
        --n_remain;

-        if (!embd.empty() && embd.back() == llama_token_eos())
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx))
        {
-            // stopping_word = llama_token_to_str(ctx, embd.back());
+            // stopping_word = llama_token_to_piece(ctx, embd.back());
            has_next_token = false;
            stopped_eos = true;
            LOG_VERBOSE("eos token found", {});
@@ -520,7 +611,7 @@ struct llama_server_context
    {
        const completion_token_output token_with_probs = nextToken();

-        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(ctx, token_with_probs.tok);
+        const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
        generated_text += token_text;

        if (params.n_probs > 0)
@@ -608,8 +699,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    fprintf(stdout, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -623,14 +712,17 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    {
        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
+    fprintf(stdout, "  --numa                attempt optimizations that help on some NUMA systems\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
    fprintf(stdout, "                        number of layers to store in VRAM\n");
    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
    fprintf(stdout, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif
    fprintf(stdout, "  -m FNAME, --model FNAME\n");
    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
@@ -726,23 +818,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.n_ctx = std::stoi(argv[i]);
        }
-        else if (arg == "-gqa" || arg == "--gqa")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_gqa = std::stoi(argv[i]);
-        }
-        else if (arg == "-eps" || arg == "--rms-norm-eps") {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.rms_norm_eps = std::stof(argv[i]);
-        }
        else if (arg == "--rope-freq-base")
        {
            if (++i >= argc)
@@ -827,7 +902,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                }
            }
 #else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
+            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
 #endif // GGML_USE_CUBLAS
        }
        else if (arg == "--low-vram" || arg == "-lv")
@@ -835,7 +910,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 #ifdef GGML_USE_CUBLAS
            params.low_vram = true;
 #else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
+#endif // GGML_USE_CUBLAS
+        }
+        else if (arg == "--no-mul-mat-q" || arg == "-nommq")
+        {
+#ifdef GGML_USE_CUBLAS
+            params.mul_mat_q = false;
+#else
+            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
 #endif // GGML_USE_CUBLAS
        }
        else if (arg == "--main-gpu" || arg == "-mg")
@@ -886,6 +969,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        {
            params.use_mmap = false;
        }
+        else if (arg == "--numa")
+        {
+            params.numa = true;
+        }
        else if (arg == "--embedding")
        {
            params.embedding = true;
@@ -908,7 +995,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,

 static json format_generation_settings(llama_server_context &llama)
 {
-    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos());
+    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
    const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
                            eos_bias->second < 0.0f && std::isinf(eos_bias->second);

@@ -936,6 +1023,7 @@ static json format_generation_settings(llama_server_context &llama)
        {"stream", llama.stream},
        {"logit_bias", llama.params.logit_bias},
        {"n_probs", llama.params.n_probs},
+        {"grammar", llama.params.grammar},
    };
 }

@@ -953,7 +1041,7 @@ static json format_timings(llama_server_context &llama)
    assert(timings.n_eval == llama.num_tokens_predicted);

    return json{
-        {"prompt_n", timings.n_eval},
+        {"prompt_n", timings.n_p_eval},
        {"prompt_ms", timings.t_p_eval_ms},
        {"prompt_per_token_ms", timings.t_p_eval_ms / timings.n_p_eval},
        {"prompt_per_second", 1e3 / timings.t_p_eval_ms * timings.n_p_eval},
@@ -975,14 +1063,13 @@ static json format_final_response(llama_server_context &llama, const std::string
        {"tokens_predicted", llama.num_tokens_predicted},
        {"tokens_evaluated", llama.num_prompt_tokens},
        {"generation_settings", format_generation_settings(llama)},
-        {"prompt", llama.params.prompt},
+        {"prompt", llama.prompt},
        {"truncated", llama.truncated},
        {"stopped_eos", llama.stopped_eos},
        {"stopped_word", llama.stopped_word},
        {"stopped_limit", llama.stopped_limit},
        {"stopping_word", llama.stopping_word},
        {"tokens_cached", llama.n_past},
-        {"tokens_predicted", llama.num_tokens_predicted},
        {"timings", format_timings(llama)},
    };

@@ -1015,34 +1102,58 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
        {"tokens", tokens}};
 }

+static json format_detokenized_response(std::string content)
+{
+    return json{
+        {"content", content}};
+}
+
+template <typename T>
+static T json_value(const json &body, const std::string &key, const T &default_value)
+{
+    // Fallback null to default value
+    return body.contains(key) && !body.at(key).is_null()
+        ? body.value(key, default_value)
+        : default_value;
+}
+
 static void parse_options_completion(const json &body, llama_server_context &llama)
 {
    gpt_params default_params;

-    llama.stream = body.value("stream", false);
-    llama.params.n_predict = body.value("n_predict", default_params.n_predict);
-    llama.params.top_k = body.value("top_k", default_params.top_k);
-    llama.params.top_p = body.value("top_p", default_params.top_p);
-    llama.params.tfs_z = body.value("tfs_z", default_params.tfs_z);
-    llama.params.typical_p = body.value("typical_p", default_params.typical_p);
-    llama.params.repeat_last_n = body.value("repeat_last_n", default_params.repeat_last_n);
-    llama.params.temp = body.value("temperature", default_params.temp);
-    llama.params.repeat_penalty = body.value("repeat_penalty", default_params.repeat_penalty);
-    llama.params.presence_penalty = body.value("presence_penalty", default_params.presence_penalty);
-    llama.params.frequency_penalty = body.value("frequency_penalty", default_params.frequency_penalty);
-    llama.params.mirostat = body.value("mirostat", default_params.mirostat);
-    llama.params.mirostat_tau = body.value("mirostat_tau", default_params.mirostat_tau);
-    llama.params.mirostat_eta = body.value("mirostat_eta", default_params.mirostat_eta);
-    llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl);
-    llama.params.n_keep = body.value("n_keep", default_params.n_keep);
-    llama.params.seed = body.value("seed", default_params.seed);
-    llama.params.prompt = body.value("prompt", default_params.prompt);
-    llama.params.n_probs = body.value("n_probs", default_params.n_probs);
+    llama.stream = json_value(body, "stream", false);
+    llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
+    llama.params.top_k = json_value(body, "top_k", default_params.top_k);
+    llama.params.top_p = json_value(body, "top_p", default_params.top_p);
+    llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
+    llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
+    llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
+    llama.params.temp = json_value(body, "temperature", default_params.temp);
+    llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
+    llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
+    llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
+    llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
+    llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
+    llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
+    llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
+    llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
+    llama.params.seed = json_value(body, "seed", default_params.seed);
+    llama.params.grammar = json_value(body, "grammar", default_params.grammar);
+    llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
+
+    if (body.count("prompt") != 0)
+    {
+        llama.prompt = body["prompt"];
+    }
+    else
+    {
+        llama.prompt = "";
+    }

    llama.params.logit_bias.clear();
-    if (body.value("ignore_eos", false))
+    if (json_value(body, "ignore_eos", false))
    {
-        llama.params.logit_bias[llama_token_eos()] = -INFINITY;
+        llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
    }

    const auto &logit_bias = body.find("logit_bias");
@@ -1102,6 +1213,62 @@ static void log_server_request(const Request &req, const Response &res)
                           });
 }

+bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
+    return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
+}
+
+// Function matching type llama_beam_search_callback_fn_t.
+// Custom callback example is called each time the beams lengths increase:
+//  * Show progress by printing ',' following by number of convergent beam tokens if any.
+//  * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
+//    This is also called when the stop condition is met.
+//    Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
+void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
+    auto & llama = *static_cast<llama_server_context*>(callback_data);
+    // Mark beams as EOS as needed.
+    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
+        llama_beam_view& beam_view = beams_state.beam_views[i];
+        if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) {
+            beam_view.eob = true;
+        }
+    }
+    printf(",");  // Show progress
+    if (const size_t n = beams_state.common_prefix_length) {
+        llama.generated_token_probs.resize(llama.generated_token_probs.size() + n);
+        assert(0u < beams_state.n_beams);
+        const llama_token * tokens = beams_state.beam_views[0].tokens;
+        const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
+        std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
+        printf("%lu", n);
+    }
+    fflush(stdout);
+#if 0 // DEBUG: print current beams for this iteration
+    std::cout << "\n\nCurrent beams:\n";
+    for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
+        std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
+    }
+#endif
+}
+
+struct token_translator {
+    llama_context * ctx;
+    std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
+    std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
+};
+
+void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
+    auto & gtps = llama.generated_token_probs;
+    auto translator = token_translator{llama.ctx};
+    auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
+    const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen);
+    if (llama.generated_text.capacity() < llama.generated_text.size() + len) {
+        llama.generated_text.reserve(llama.generated_text.size() + len);
+    }
+    for (const completion_token_output & cto : gtps) {
+        llama.generated_text += translator(cto);
+    }
+}
+
 int main(int argc, char **argv)
 {
    // own arguments required by this example
@@ -1158,6 +1325,12 @@ int main(int argc, char **argv)
        res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
        return false; });

+    // this is only called if no index.html is found in the public --path
+    svr.Get("/json-schema-to-grammar.mjs", [](const Request &, Response &res)
+            {
+        res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript");
+        return false; });
+
    svr.Post("/completion", [&llama](const Request &req, Response &res)
             {
        auto lock = llama.lock();
@@ -1168,26 +1341,40 @@ int main(int argc, char **argv)

        parse_options_completion(json::parse(req.body), llama);

+        if (!llama.loadGrammar())
+        {
+            res.status = 400;
+            return;
+        }
+
        llama.loadPrompt();
        llama.beginCompletion();

        if (!llama.stream) {
-            size_t stop_pos = std::string::npos;
+            if (llama.params.n_beams) {
+                // Fill llama.generated_token_probs vector with final beam.
+                llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
+                                  llama.n_past, llama.n_remain, llama.params.n_threads);
+                // Translate llama.generated_token_probs to llama.generated_text.
+                append_to_generated_text_from_generated_token_probs(llama);
+            } else {
+                size_t stop_pos = std::string::npos;

-            while (llama.has_next_token) {
-                const completion_token_output token_with_probs = llama.doCompletion();
-                const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
+                while (llama.has_next_token) {
+                    const completion_token_output token_with_probs = llama.doCompletion();
+                    const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(llama.ctx, token_with_probs.tok);

-                stop_pos = llama.findStoppingStrings(llama.generated_text,
-                    token_text.size(), STOP_FULL);
-            }
+                    stop_pos = llama.findStoppingStrings(llama.generated_text,
+                        token_text.size(), STOP_FULL);
+                }

-            if (stop_pos == std::string::npos) {
-                stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
-            }
-            if (stop_pos != std::string::npos) {
-                llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
-                    llama.generated_text.end());
+                if (stop_pos == std::string::npos) {
+                    stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
+                }
+                if (stop_pos != std::string::npos) {
+                    llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
+                        llama.generated_text.end());
+                }
            }

            const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
@@ -1203,59 +1390,86 @@ int main(int argc, char **argv)

                while (llama.has_next_token) {
                    const completion_token_output token_with_probs = llama.doCompletion();
-                    const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
-                    if (llama.multibyte_pending > 0) {
+                    if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
                        continue;
                    }
+                    const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);

                    size_t pos = std::min(sent_count, llama.generated_text.size());

                    const std::string str_test = llama.generated_text.substr(pos);
+                    bool is_stop_full = false;
                    size_t stop_pos =
                        llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
                    if (stop_pos != std::string::npos) {
+                        is_stop_full = true;
                        llama.generated_text.erase(
                            llama.generated_text.begin() + pos + stop_pos,
                            llama.generated_text.end());
                        pos = std::min(sent_count, llama.generated_text.size());
                    } else {
+                        is_stop_full = false;
                        stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
                            STOP_PARTIAL);
                    }

-                    const std::string to_send = llama.generated_text.substr(pos, stop_pos);
-                    sent_count += to_send.size();
+                    if (
+                        stop_pos == std::string::npos ||
+                        // Send rest of the text if we are at the end of the generation
+                        (!llama.has_next_token && !is_stop_full && stop_pos > 0)
+                    ) {
+                        const std::string to_send = llama.generated_text.substr(pos, std::string::npos);

-                    std::vector<completion_token_output> probs_output = {};
+                        sent_count += to_send.size();

-                    if (llama.params.n_probs > 0) {
-                        const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
-                        size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
-                        size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
-                        if (probs_pos < probs_stop_pos) {
-                            probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
+                        std::vector<completion_token_output> probs_output = {};
+
+                        if (llama.params.n_probs > 0) {
+                            const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
+                            size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
+                            size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
+                            if (probs_pos < probs_stop_pos) {
+                                probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
+                            }
+                            sent_token_probs_index = probs_stop_pos;
+                        }
+
+                        const json data = format_partial_response(llama, to_send, probs_output);
+
+                        const std::string str =
+                            "data: " +
+                            data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                            "\n\n";
+
+                        LOG_VERBOSE("data stream", {
+                            { "to_send", str }
+                        });
+
+                        if (!sink.write(str.data(), str.size())) {
+                            LOG_VERBOSE("stream closed", {});
+                            llama_print_timings(llama.ctx);
+                            return false;
                        }
-                        sent_token_probs_index = probs_stop_pos;
                    }

-                    const json data = llama.has_next_token
-                                          ? format_partial_response(llama, to_send, probs_output)
-                                          // Generation is done, send extra information.
-                                          : format_final_response(llama, to_send, llama.generated_token_probs);
+                    if (!llama.has_next_token) {
+                        // Generation is done, send extra information.
+                        const json data = format_final_response(llama, "", llama.generated_token_probs);

-                    const std::string str =
-                        "data: " +
-                        data.dump(-1, ' ', false, json::error_handler_t::replace) +
-                        "\n\n";
+                        const std::string str =
+                            "data: " +
+                            data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                            "\n\n";

-                    LOG_VERBOSE("data stream", {
-                        { "to_send", str }
-                    });
+                        LOG_VERBOSE("data stream", {
+                            { "to_send", str }
+                        });

-                    if (!sink.write(str.data(), str.size())) {
-                        LOG_VERBOSE("stream closed", {});
-                        llama_print_timings(llama.ctx);
-                        return false;
+                        if (!sink.write(str.data(), str.size())) {
+                            LOG_VERBOSE("stream closed", {});
+                            llama_print_timings(llama.ctx);
+                            return false;
+                        }
                    }
                }

@@ -1263,7 +1477,11 @@ int main(int argc, char **argv)
                sink.done();
                return true;
            };
-            res.set_chunked_content_provider("text/event-stream", chunked_content_provider);
+            const auto on_complete = [&](bool) {
+                llama.mutex.unlock();
+            };
+            lock.release();
+            res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
        } });

    svr.Get("/model.json", [&llama](const Request &, Response &res)
@@ -1279,11 +1497,29 @@ int main(int argc, char **argv)
        auto lock = llama.lock();

        const json body = json::parse(req.body);
-        const std::string content = body.value("content", "");
-        const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
+        std::vector<llama_token> tokens;
+        if (body.count("content") != 0)
+        {
+            tokens = llama.tokenize(body["content"], false);
+        }
        const json data = format_tokenizer_response(tokens);
        return res.set_content(data.dump(), "application/json"); });

+    svr.Post("/detokenize", [&llama](const Request &req, Response &res)
+             {
+        auto lock = llama.lock();
+
+        const json body = json::parse(req.body);
+        std::string content;
+        if (body.count("tokens") != 0)
+        {
+            const std::vector<llama_token> tokens = body["tokens"];
+            content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
+        }
+
+        const json data = format_detokenized_response(content);
+        return res.set_content(data.dump(), "application/json"); });
+
    svr.Post("/embedding", [&llama](const Request &req, Response &res)
             {
        auto lock = llama.lock();
@@ -1292,7 +1528,14 @@ int main(int argc, char **argv)

        llama.rewind();
        llama_reset_timings(llama.ctx);
-        llama.params.prompt = body.value("content", "");
+        if (body.count("content") != 0)
+        {
+            llama.prompt = body["content"];
+        }
+        else
+        {
+            llama.prompt = "";
+        }
        llama.params.n_predict = 0;
        llama.loadPrompt();
        llama.beginCompletion();
@@ -1319,8 +1562,12 @@ int main(int argc, char **argv)

    svr.set_error_handler([](const Request &, Response &res)
                          {
-        res.set_content("File Not Found", "text/plain");
-        res.status = 404; });
+        if (res.status == 400) {
+            res.set_content("Invalid request", "text/plain");
+        } else if (res.status != 500) {
+            res.set_content("File Not Found", "text/plain");
+            res.status = 404;
+        } });

    // set timeouts and change hostname and port
    svr.set_read_timeout(sparams.read_timeout);
@@ -1348,6 +1595,9 @@ int main(int argc, char **argv)
        return 1;
    }

+    if (llama.grammar != nullptr) {
+        llama_grammar_free(llama.grammar);
+    }
    llama_backend_free();

    return 0;
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -2,180 +2,129 @@
 #define _GNU_SOURCE
 #endif

-#include "common.h"
-#include "llama.h"
 #include "build-info.h"

-#include <cassert>
-#include <cinttypes>
+#include "common.h"
+#include "llama.h"
+
 #include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <vector>

-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
-#include <signal.h>
-#endif
-
-
-
-int main(int argc, char ** argv)
-{
+int main(int argc, char ** argv) {
    gpt_params params;

-    //---------------------------------
-    // Print help :
-    //---------------------------------
-
-    if ( argc == 1 || argv[1][0] == '-' )
-    {
-        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
        return 1 ;
    }

-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-
-    if ( argc >= 2 )
-    {
+    if (argc >= 2) {
        params.model = argv[1];
    }

-    if ( argc >= 3 )
-    {
+    if (argc >= 3) {
        params.prompt = argv[2];
    }

-    if ( params.prompt.empty() )
-    {
+    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }

-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
+    // init LLM

    llama_backend_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
+    llama_context_params ctx_params = llama_context_default_params();

-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);

-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    // tokenize the prompt

    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

-    const int max_context_size     = llama_n_ctx( ctx );
-    const int max_tokens_list_size = max_context_size - 4 ;
+    const int max_context_size     = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;

-    if ( (int)tokens_list.size() > max_tokens_list_size )
-    {
-        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+    if ((int) tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
        return 1;
    }

-    fprintf( stderr, "\n\n" );
+    fprintf(stderr, "\n\n");

-    // Print the tokens from the prompt :
-
-    for( auto id : tokens_list )
-    {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
    }

-    fflush(stdout);
+    fflush(stderr);

-
-    //---------------------------------
-    // Main prediction loop :
-    //---------------------------------
+    // main loop

    // The LLM keeps a contextual cache memory of previous token evaluation.
    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.

-    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    {
-        //---------------------------------
-        // Evaluate the tokens :
-        //---------------------------------
+    const int n_gen = std::min(32, max_context_size);

-        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        {
-            fprintf( stderr,  "%s : failed to eval\n" , __func__ );
+    while (llama_get_kv_cache_token_count(ctx) < n_gen) {
+        // evaluate the transformer
+
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
            return 1;
        }

        tokens_list.clear();

-        //---------------------------------
-        // Select the best prediction :
-        //---------------------------------
+        // sample the next token

        llama_token new_token_id = 0;

-        auto logits  = llama_get_logits( ctx );
-        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        auto logits  = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);

        std::vector<llama_token_data> candidates;
-        candidates.reserve( n_vocab );
+        candidates.reserve(n_vocab);

-        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
-        {
-            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-        // Select it using the "Greedy sampling" method :
-        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
-
+        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);

        // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
-        {
+        if (new_token_id == llama_token_eos(ctx)) {
            fprintf(stderr, " [end of text]\n");
            break;
        }

-        // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
-        fflush( stdout );
+        // print the new token :
+        printf("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+        fflush(stdout);

-        // Push this new token for next evaluation :
-        tokens_list.push_back( new_token_id );
+        // push this new token for next evaluation
+        tokens_list.push_back(new_token_id);
+    }

-    } // wend of main loop
-
-    llama_free( ctx );
-    llama_free_model( model );
+    llama_free(ctx);
+    llama_free_model(model);

    llama_backend_free();

+    fprintf(stderr, "\n\n");
+
    return 0;
 }
-
-// EOF
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "common.h"
 #include "llama.h"
 #include <unordered_map>
 #include <vector>
@@ -16,7 +17,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+static const float rms_norm_eps = 1e-5f;

 struct random_normal_distribution {
    std::mt19937 gen;
@@ -169,14 +170,16 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
 struct llama_vocab {
    using id    = int32_t;
    using token = std::string;
+    using ttype = llama_token_type;

-    struct token_score {
-        token tok;
+    struct token_data {
+        token text;
        float score;
+        ttype type;
    };

    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token;
 };

 struct my_llama_hparams {
@@ -1865,10 +1868,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
        t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1));                                            assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head);
        t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd));                 assert_shape_2d(t11->grad, N*n_batch, n_embd);
        t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3));                                            assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch);
-        t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx));                     assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
+        t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx, 10000.0f, 1.0f, 0.0f, false));        assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
        t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch));                                  assert_shape_2d(t08->grad, n_embd, N*n_batch);
        t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3));                                            assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch);
-        t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx));                     assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
+        t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx, 10000.0f, 1.0f, 0.0f, false));        assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
        t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch));                                  assert_shape_2d(t05->grad, n_embd, N*n_batch);
        t04->grad = expand(gb, ggml_add_inplace(ctx0,
                        ggml_add_inplace(ctx0,
@@ -1961,7 +1964,7 @@ void print_matrix(struct ggml_tensor * probs) {


 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token));
+    printf("%s", llama_token_to_piece(ctx, token).c_str());
 }

 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -1995,7 +1998,7 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens)
    }
 }

-void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
    int n_tokens = tokens_input->ne[0];
    int n_vocab  = target_logits->ne[0];

@@ -2004,7 +2007,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons

    ggml_set_f32(target_logits, -1.0f/n_vocab);
    ggml_set_f32(target_probs, 0.0f);
-    ggml_set_i32_1d(tokens_input, 0, llama_token_bos());
+    ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx));
    for (int i=1; i<n_tokens+1; ++i) {
        int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
        set_f32_2d(target_logits, token, i-1, +1.0f);
@@ -2015,7 +2018,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
    }
 }

-void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
    GGML_ASSERT(tokens_input->n_dims  == 2);
    GGML_ASSERT(target_logits->n_dims == 3);
    GGML_ASSERT(target_probs->n_dims  == 3);
@@ -2035,7 +2038,7 @@ void get_example_targets_batch(struct llama_context * /*lctx*/, const int * trai
        size_t sample = train_samples[(example_id*n_batch + k) % n_train_samples];
        GGML_ASSERT(sample+n_tokens-1 < n_train_data);

-        set_i32_2d(tokens_input, 0, k, llama_token_bos());
+        set_i32_2d(tokens_input, 0, k, llama_token_bos(lctx));
        for (int i=1; i<n_tokens+1; ++i) {
            int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
            // print_token(lctx, token);
@@ -2188,11 +2191,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
    f.read_raw(buf.data(), f.size);
    buf[f.size] = '\0';

-    out.resize(buf.size());
-
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
-    if (n_tokens >= 0) {
-        out.resize(n_tokens);
+    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
+    if (n_tokens < 0) {
+        out.resize(-n_tokens);
+        llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
    }

    bool verify = false;
@@ -2200,17 +2202,17 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
        const char * in  = buf.data();
        const char * end = buf.data() + buf.size();
        for (int i = 0; i < (int) out.size(); ++i) {
-            const char * s = llama_token_to_str(lctx, out[i]);
-            int len = strlen(s);
+            std::string s = llama_token_to_piece(lctx, out[i]);
+            int len = s.length();
            if (in >= end) {
                printf("%s: unexpected end of original text.\n", __func__);
                break;
            }
-            const bool matches = (strncmp(in, s, len) == 0);
+            const bool matches = (strncmp(in, s.c_str(), len) == 0);
            if (matches) {
                in += len;
            } else {
-                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);
+                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());
            }
        }
    }
@@ -2294,7 +2296,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
    const auto params = sampler->params;

    // Apply penalties
-    const float nl_logit = logits[llama_token_nl()];
+    const float nl_logit = logits[llama_token_nl(ctx)];

    const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx);

@@ -2313,7 +2315,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
        params.alpha_presence);

    if (!params.penalize_nl) {
-        logits[llama_token_nl()] = nl_logit;
+        logits[llama_token_nl(ctx)] = nl_logit;
    }

    llama_token token = 0;
@@ -2612,42 +2614,45 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
        return;
    }

-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
-    // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);
-    // write_vocab
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
-    // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output);
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
-    }
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+//    // write_magic
+//    file.write_u32(LLAMA_FILE_MAGIC);   // magic
+//    file.write_u32(LLAMA_FILE_VERSION); // version
+//    // write_hparams
+//    file.write_u32(model->hparams.n_vocab);
+//    file.write_u32(model->hparams.n_embd);
+//    file.write_u32(model->hparams.n_mult);
+//    file.write_u32(model->hparams.n_head);
+//    file.write_u32(model->hparams.n_layer);
+//    file.write_u32(model->hparams.n_rot);
+//    file.write_u32(LLAMA_FTYPE_ALL_F32);
+//    // write_vocab
+//    uint32_t n_vocab = model->hparams.n_vocab;
+//    for (uint32_t i = 0; i < n_vocab; i++) {
+//        const auto & token_data = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_data.tok.size());
+//        file.write_raw(token_data.tok.data(), token_data.tok.size());
+//        file.write_raw(&token_data.score, sizeof(token_data.score));
+//    }
+//    // write tensors
+//    write_tensor(&file, model->tok_embeddings);
+//    write_tensor(&file, model->norm);
+//    write_tensor(&file, model->output);
+//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+//        auto & layer = model->layers[i];
+//
+//        write_tensor(&file, layer.attention_norm);
+//        write_tensor(&file, layer.wq);
+//        write_tensor(&file, layer.wk);
+//        write_tensor(&file, layer.wv);
+//        write_tensor(&file, layer.wo);
+//        write_tensor(&file, layer.ffn_norm);
+//        write_tensor(&file, layer.w1);
+//        write_tensor(&file, layer.w2);
+//        write_tensor(&file, layer.w3);
+//    }
 }

 float cosine_decay(const int decay_steps, const float alpha, int step) {
@@ -3052,20 +3057,13 @@ int main(int argc, char ** argv) {

    struct llama_vocab vocab;
    {
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
        vocab.id_to_token.resize(n_vocab);
        for (int i=0; i<n_vocab; ++i) {
-            std::string tok   = std::string(strings[i]);
-            float       score = scores[i];
-            vocab.id_to_token[i].tok   = tok;
-            vocab.id_to_token[i].score = score;
-            vocab.token_to_id.emplace(tok, i);
+            vocab.id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab.id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab.id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab.token_to_id.emplace(vocab.id_to_token[i].text, i);
        }
    }

@@ -3178,7 +3176,7 @@ int main(int argc, char ** argv) {
    std::vector<int> train_samples;
    train_samples.push_back(0);
    for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) {
-        if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl())) {
+        if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) {
            train_samples.push_back(i);
        }
    }
@@ -3338,7 +3336,7 @@ int main(int argc, char ** argv) {
        struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab,  n_tokens);
        struct ggml_tensor * target_probs  = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab,  n_tokens);

-        get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
+        get_example_targets(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
        for (int i=sample_ctx; i<n_tokens; ++i) {
            ggml_set_i32_1d(tokens_input, i, n_vocab/2);
        }
--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
        "systems": "systems"
      },
      "locked": {
-        "lastModified": 1685518550,
-        "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
+        "lastModified": 1692799911,
+        "narHash": "sha256-3eihraek4qL744EvQXsK1Ha6C3CR7nnT8X2qWap4RNk=",
        "owner": "numtide",
        "repo": "flake-utils",
-        "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
+        "rev": "f9e7cf818399d17d347f847525c5a5a8032e4e44",
        "type": "github"
      },
      "original": {
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1685931219,
-        "narHash": "sha256-8EWeOZ6LKQfgAjB/USffUSELPRjw88A+xTcXnOUvO5M=",
+        "lastModified": 1692913444,
+        "narHash": "sha256-1SvMQm2DwofNxXVtNWWtIcTh7GctEVrS/Xel/mdc6iY=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "7409480d5c8584a1a83c422530419efe4afb0d19",
+        "rev": "18324978d632ffc55ef1d928e81630c620f4f447",
        "type": "github"
      },
      "original": {
--- a/flake.nix
+++ b/flake.nix
@@ -6,6 +6,9 @@
  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
+        name = "llama.cpp";
+        src = ./.;
+        meta.mainProgram = "llama";
        inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
        buildInputs = with pkgs; [ openmpi ];
        osSpecific = with pkgs; buildInputs ++
@@ -14,8 +17,6 @@
            with pkgs.darwin.apple_sdk_11_0.frameworks; [
              Accelerate
              MetalKit
-              MetalPerformanceShaders
-              MetalPerformanceShadersGraph
            ]
          else if isAarch32 && isDarwin then
            with pkgs.darwin.apple_sdk.frameworks; [
@@ -23,11 +24,17 @@
              CoreGraphics
              CoreVideo
            ]
+          else if isDarwin then
+            with pkgs.darwin.apple_sdk.frameworks; [
+              Accelerate
+              CoreGraphics
+              CoreVideo
+            ]
          else
            with pkgs; [ openblas ]
        );
        pkgs = import nixpkgs { inherit system; };
-        nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
+        nativeBuildInputs = with pkgs; [ cmake ninja pkgconfig ];
        llama-python =
          pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
        postPatch = ''
@@ -40,35 +47,35 @@
          mv $out/bin/server $out/bin/llama-server
        '';
        cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
-      in {
+      in
+      {
        packages.default = pkgs.stdenv.mkDerivation {
-          name = "llama.cpp";
-          src = ./.;
-          postPatch = postPatch;
-          nativeBuildInputs = nativeBuildInputs;
-          buildInputs = osSpecific;
+          inherit name src meta postPatch nativeBuildInputs buildInputs postInstall;
          cmakeFlags = cmakeFlags
            ++ (if isAarch64 && isDarwin then [
-              "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
-              "-DLLAMA_METAL=ON"
-            ] else [
-              "-DLLAMA_BLAS=ON"
-              "-DLLAMA_BLAS_VENDOR=OpenBLAS"
+            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
+            "-DLLAMA_METAL=ON"
+          ] else [
+            "-DLLAMA_BLAS=ON"
+            "-DLLAMA_BLAS_VENDOR=OpenBLAS"
          ]);
-          postInstall = postInstall;
-          meta.mainProgram = "llama";
        };
        packages.opencl = pkgs.stdenv.mkDerivation {
-          name = "llama.cpp";
-          src = ./.;
-          postPatch = postPatch;
-          nativeBuildInputs = nativeBuildInputs;
+          inherit name src meta postPatch nativeBuildInputs postInstall;
          buildInputs = with pkgs; buildInputs ++ [ clblast ];
          cmakeFlags = cmakeFlags ++ [
            "-DLLAMA_CLBLAST=ON"
          ];
-          postInstall = postInstall;
-          meta.mainProgram = "llama";
+        };
+        packages.rocm = pkgs.stdenv.mkDerivation {
+          inherit name src meta postPatch nativeBuildInputs postInstall;
+          buildInputs = with pkgs; buildInputs ++ [ hip hipblas rocblas ];
+          cmakeFlags = cmakeFlags ++ [
+            "-DLLAMA_HIPBLAS=1"
+            "-DCMAKE_C_COMPILER=hipcc"
+            "-DCMAKE_CXX_COMPILER=hipcc"
+            "-DCMAKE_POSITION_INDEPENDENT_CODE=ON"
+          ];
        };
        apps.llama-server = {
          type = "app";
@@ -82,8 +89,13 @@
          type = "app";
          program = "${self.packages.${system}.default}/bin/llama";
        };
+        apps.quantize = {
+          type = "app";
+          program = "${self.packages.${system}.default}/bin/quantize";
+        };
        apps.default = self.apps.${system}.llama;
        devShells.default = pkgs.mkShell {
+          buildInputs = [ llama-python ];
          packages = nativeBuildInputs ++ osSpecific;
        };
      });
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -0,0 +1,593 @@
+#include "ggml-alloc.h"
+#include "ggml.h"
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define UNUSED(x) (void)(x)
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
+
+//#define GGML_ALLOCATOR_DEBUG
+
+//#define AT_PRINTF printf
+#define AT_PRINTF(...) ((void)0)
+
+struct hash_node {
+    struct ggml_tensor * t;
+    int n_children;
+    int n_views;
+};
+
+static size_t hash(void * p) {
+    return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
+}
+
+static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
+    size_t h = hash(t);
+
+    // linear probing
+    size_t i = h;
+    while (hash_table[i].t != NULL) {
+        if (hash_table[i].t == t) {
+            return &hash_table[i];
+        }
+        i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
+        if (i == h) {
+            // hash table is full
+            GGML_ASSERT(false);
+        }
+    }
+
+    hash_table[i].t = t;
+    return &hash_table[i];
+}
+
+// TODO: GGML_PAD ?
+static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
+    assert(alignment && !(alignment & (alignment - 1))); // power of 2
+    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
+    return offset + align;
+}
+
+struct free_block {
+    void * addr;
+    size_t size;
+};
+
+#define MAX_FREE_BLOCKS 128
+
+struct ggml_allocr {
+    void * data;
+    size_t size;
+    size_t alignment;
+    int n_free_blocks;
+    struct free_block free_blocks[MAX_FREE_BLOCKS];
+    struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+    size_t max_size;
+    bool measure;
+    int parse_seq[GGML_MAX_CONCUR];
+    int parse_seq_len;
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    struct ggml_tensor * allocated_tensors[1024];
+#endif
+};
+
+#ifdef GGML_ALLOCATOR_DEBUG
+static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    for (int i = 0; i < 1024; i++) {
+        if (alloc->allocated_tensors[i] == NULL) {
+            alloc->allocated_tensors[i] = tensor;
+            return;
+        }
+    }
+    GGML_ASSERT(!"out of allocated_tensors");
+}
+static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    for (int i = 0; i < 1024; i++) {
+        if (alloc->allocated_tensors[i] == tensor ||
+            (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
+            alloc->allocated_tensors[i] = NULL;
+            return;
+        }
+    }
+    printf("tried to free tensor %s not found\n", tensor->name);
+    GGML_ASSERT(!"tensor not found");
+}
+#endif
+
+
+static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    return ggml_nbytes(tensor);
+
+    UNUSED(alloc);
+}
+
+void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+    size = aligned_offset(NULL, size, alloc->alignment);
+
+    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
+
+    size_t max_avail = 0;
+
+    // find the best fitting free block besides the last block
+    int best_fit_block = -1;
+    size_t best_fit_size = SIZE_MAX;
+    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
+        struct free_block * block = &alloc->free_blocks[i];
+        max_avail = MAX(max_avail, block->size);
+        if (block->size >= size && block->size <= best_fit_size) {
+            best_fit_block = i;
+            best_fit_size = block->size;
+        }
+    }
+
+    AT_PRINTF("block %d\n", best_fit_block);
+
+    if (best_fit_block == -1) {
+        // the last block is our last resort
+        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        if (block->size >= size) {
+            best_fit_block = alloc->n_free_blocks - 1;
+            max_avail = MAX(max_avail, block->size);
+        } else {
+            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
+                    __func__, size, max_avail);
+            GGML_ASSERT(!"not enough space in the buffer");
+        return;
+        }
+    }
+    struct free_block * block = &alloc->free_blocks[best_fit_block];
+    void * addr = block->addr;
+    block->addr = (char*)block->addr + size;
+    block->size -= size;
+    if (block->size == 0) {
+        // remove block if empty
+        alloc->n_free_blocks--;
+        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
+            alloc->free_blocks[j] = alloc->free_blocks[j+1];
+        }
+    }
+
+    tensor->data = addr;
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    add_allocated_tensor(alloc, tensor);
+    size_t cur_max = (char*)addr - (char*)alloc->data + size;
+    if (cur_max > alloc->max_size) {
+        printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        for (int i = 0; i < 1024; i++) {
+            if (alloc->allocated_tensors[i]) {
+                printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
+            }
+        }
+        printf("\n");
+    }
+#endif
+
+    alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
+}
+
+// this is a very naive implementation, but for our case the number of free blocks should be very small
+static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    void * ptr = tensor->data;
+
+    if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
+        // the tensor was not allocated in this buffer
+        // this can happen because the graph allocator will try to free weights and other tensors from different buffers
+        // the easiest way to deal with this is just to ignore it
+        return;
+    }
+
+    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+    size = aligned_offset(NULL, size, alloc->alignment);
+    AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    remove_allocated_tensor(alloc, tensor);
+#endif
+
+    // see if we can merge with an existing block
+    for (int i = 0; i < alloc->n_free_blocks; i++) {
+        struct free_block * block = &alloc->free_blocks[i];
+        // check if ptr is at the end of the block
+        if ((char*)block->addr + block->size == ptr) {
+            block->size += size;
+            // check if we can merge with the next block
+            if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
+                block->size += alloc->free_blocks[i+1].size;
+                alloc->n_free_blocks--;
+                for (int j = i+1; j < alloc->n_free_blocks; j++) {
+                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+                }
+            }
+            return;
+        }
+        // check if ptr is at the beginning of the block
+        if ((char*)ptr + size == block->addr) {
+            block->addr = ptr;
+            block->size += size;
+            // check if we can merge with the previous block
+            if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
+                alloc->free_blocks[i-1].size += block->size;
+                alloc->n_free_blocks--;
+                for (int j = i; j < alloc->n_free_blocks; j++) {
+                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+                }
+            }
+            return;
+        }
+    }
+    // otherwise, add a new block
+    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
+    int insert_pos = 0;
+    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
+        insert_pos++;
+    }
+    // shift all blocks from insert_pos onward to make room for the new block
+    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
+        alloc->free_blocks[i] = alloc->free_blocks[i-1];
+    }
+    // insert the new block
+    alloc->free_blocks[insert_pos].addr = ptr;
+    alloc->free_blocks[insert_pos].size = size;
+    alloc->n_free_blocks++;
+}
+
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
+    for (int i = 0; i < n; i++) {
+        alloc->parse_seq[i] = list[i];
+    }
+    alloc->parse_seq_len = n;
+}
+
+void ggml_allocr_reset(struct ggml_allocr * alloc) {
+    alloc->n_free_blocks = 1;
+    size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
+    alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
+    alloc->free_blocks[0].size = alloc->size - align_offset;
+}
+
+struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+
+    *alloc = (struct ggml_allocr){
+        /*.data          = */ data,
+        /*.size          = */ size,
+        /*.alignment     = */ alignment,
+        /*.n_free_blocks = */ 0,
+        /*.free_blocks   = */ {{0}},
+        /*.hash_table    = */ {{0}},
+        /*.max_size      = */ 0,
+        /*.measure       = */ false,
+        /*.parse_seq     = */ {0},
+        /*.parse_seq_len = */ 0,
+#ifdef GGML_ALLOCATOR_DEBUG
+        /*.allocated_tensors = */ = {0},
+#endif
+    };
+
+    ggml_allocr_reset(alloc);
+
+    return alloc;
+}
+
+// address and size of the buffer when measuring
+// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
+static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
+static const size_t MEASURE_MAX_SIZE  = 1ULL<<40; // 1 TB
+
+struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+
+    *alloc = (struct ggml_allocr){
+        /*.data          = */ MEASURE_BASE_ADDR,
+        /*.size          = */ MEASURE_MAX_SIZE,
+        /*.alignment     = */ alignment,
+        /*.n_free_blocks = */ 0,
+        /*.free_blocks   = */ {{0}},
+        /*.hash_table    = */ {{0}},
+        /*.max_size      = */ 0,
+        /*.measure       = */ true,
+        /*.parse_seq     = */ {0},
+        /*.parse_seq_len = */ 0,
+#ifdef GGML_ALLOCATOR_DEBUG
+        /*.allocated_tensors = */ = {0},
+#endif
+    };
+
+    ggml_allocr_reset(alloc);
+
+    return alloc;
+}
+
+void ggml_allocr_free(struct ggml_allocr * alloc) {
+    free(alloc);
+}
+
+bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
+    return alloc->measure;
+}
+
+//////////// compute graph allocator
+
+static bool ggml_is_view(struct ggml_tensor * t) {
+    return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
+           t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+}
+
+static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+    if (a->type != b->type) {
+        return false;
+    }
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (a->ne[i] != b->ne[i]) {
+            return false;
+        }
+        if (a->nb[i] != b->nb[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
+    switch (t->op) {
+        case GGML_OP_PERMUTE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_VIEW:
+            return t->src[0];
+        case GGML_OP_CPY:
+            return t->src[1];
+        default:
+            return NULL;
+    }
+}
+
+static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
+    struct ggml_tensor * parent = t;
+    do {
+        parent = get_view_parent(parent);
+    } while (ggml_is_view(parent));
+    return parent;
+}
+
+static bool ggml_op_can_inplace(enum ggml_op op) {
+    switch (op) {
+        case GGML_OP_SCALE:
+        case GGML_OP_DIAG_MASK_ZERO:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_ADD:
+        case GGML_OP_ADD1:
+        case GGML_OP_ACC:
+        case GGML_OP_SUB:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_LOG:
+        case GGML_OP_UNARY:
+        case GGML_OP_ROPE:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SET:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_CONT:
+            return true;
+
+        default:
+            return false;
+    }
+}
+
+static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
+    struct hash_node * ht = alloc->hash_table;
+    if (node->data == NULL) {
+        if (ggml_is_view(node)) {
+            size_t offset;
+            switch(node->op) {
+                case GGML_OP_VIEW:
+                    memcpy(&offset, node->op_params, sizeof(size_t));
+                    node->data = (char *) node->src[0]->data + offset;
+                    break;
+                case GGML_OP_PERMUTE:
+                case GGML_OP_RESHAPE:
+                case GGML_OP_TRANSPOSE:
+                    node->data = node->src[0]->data;
+                    break;
+                case GGML_OP_CPY:
+                    node->data = node->src[1]->data;
+                    break;
+                default:
+                    GGML_ASSERT(!"unknown view op");
+                    break;
+            }
+        } else {
+            // see if we can reuse a parent's buffer (inplace)
+            if (ggml_op_can_inplace(node->op)) {
+                for (int i = 0; i < GGML_MAX_SRC; i++) {
+                    struct ggml_tensor * parent = node->src[i];
+                    if (parent == NULL) {
+                        break;
+                    }
+
+                    // if the node's data is external, then we cannot re-use it
+                    if ((char *) parent->data < (char *) alloc->data ||
+                        (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+                        AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
+                        continue;
+                    }
+
+                    struct hash_node * p_hn = hash_get(ht, parent);
+                    if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
+                        if (ggml_is_view(parent)) {
+                            struct ggml_tensor * view_src = get_view_source(parent);
+                            struct hash_node * view_src_hn = hash_get(ht, view_src);
+                            if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
+                                // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
+                                // the parent's data that it will need later (same layout requirement). the problem is that then
+                                // we cannot free the tensor because the original address of the allocation is lost.
+                                // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
+                                // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
+                                AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
+                                node->data = parent->data;
+                                return;
+                            }
+                        }
+                        else {
+                            AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
+                            node->data = parent->data;
+                            return;
+                        }
+                    }
+                }
+            }
+            ggml_allocr_alloc(alloc, node);
+        }
+    }
+}
+
+static size_t ggml_allocator_alloc_graph_tensors_n(
+    struct ggml_allocr * alloc,
+    struct ggml_cgraph ** graphs, int n_graphs,
+    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
+
+    // reset hash table
+    struct hash_node * ht = alloc->hash_table;
+    memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
+
+    // count number of children and views
+    for (int g = 0; g < n_graphs; g++) {
+        struct ggml_cgraph * gf = graphs[g];
+        for (int i = 0; i < gf->n_nodes; i++) {
+            struct ggml_tensor * node = gf->nodes[i];
+
+            if (ggml_is_view(node)) {
+                struct ggml_tensor * view_src = get_view_source(node);
+                hash_get(ht, view_src)->n_views += 1;
+            }
+
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
+                }
+                hash_get(ht, parent)->n_children += 1;
+            }
+        }
+    }
+
+    // allocate tensors
+    for (int g = 0; g < n_graphs; g++) {
+        struct ggml_cgraph * gf = graphs[g];
+        AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
+        // graph inputs are allocated first to ensure that they are not overwritten by each other
+        if (inputs != NULL && inputs[g] != NULL) {
+            for (int i = 0; inputs[g][i] != NULL; i++) {
+                struct ggml_tensor * input = inputs[g][i];
+                AT_PRINTF("input: %s\n", input->name);
+                allocate_node(alloc, input);
+            }
+        }
+        // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+        int last_barrier_pos = 0;
+        int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
+
+        for (int ind = 0; ind < n_nodes; ind++) {
+            // allocate a node if there is no parse_seq or this is not a barrier
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+                int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
+                struct ggml_tensor * node = gf->nodes[i];
+
+                // allocate parents (leafs)
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    allocate_node(alloc, parent);
+                }
+
+                // allocate node
+                allocate_node(alloc, node);
+
+                AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * parent = node->src[j];
+                    if (parent == NULL) {
+                        break;
+                    }
+                    AT_PRINTF("%s", parent->name);
+                    if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                        AT_PRINTF(", ");
+                    }
+                }
+                AT_PRINTF("\n");
+            }
+
+
+            // update parents
+            // update immediately if there is no parse_seq
+            // update only at barriers if there is parse_seq
+            if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+                int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+                int update_end   = alloc->parse_seq_len ? ind              : ind + 1;
+                for (int i = update_start; i < update_end; i++) {
+                    int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+                    struct ggml_tensor * node = gf->nodes[node_i];
+
+                    for (int j = 0; j < GGML_MAX_SRC; j++) {
+                        struct ggml_tensor * parent = node->src[j];
+                        if (parent == NULL) {
+                            break;
+                        }
+                        struct hash_node * p_hn = hash_get(ht, parent);
+                        p_hn->n_children -= 1;
+
+                        //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+                        if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                            if (ggml_is_view(parent)) {
+                                struct ggml_tensor * view_src = get_view_source(parent);
+                                struct hash_node * view_src_hn = hash_get(ht, view_src);
+                                view_src_hn->n_views -= 1;
+                                AT_PRINTF("view_src %s\n", view_src->name);
+                                if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, view_src);
+                                }
+                            }
+                            else {
+                                if (parent->data != node->data) {
+                                    ggml_allocator_free_tensor(alloc, parent);
+                                }
+                            }
+                        }
+                    }
+                }
+                AT_PRINTF("\n");
+                if (alloc->parse_seq_len) {
+                    last_barrier_pos = ind + 1;
+                }
+            }
+        }
+        // free graph outputs here that wouldn't be freed otherwise because they have no children
+        if (outputs != NULL && outputs[g] != NULL) {
+            for (int i = 0; outputs[g][i] != NULL; i++) {
+                struct ggml_tensor * output = outputs[g][i];
+                AT_PRINTF("output: %s\n", output->name);
+                ggml_allocator_free_tensor(alloc, output);
+            }
+        }
+    }
+
+    return alloc->max_size;
+}
+
+size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
+    return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+}
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+
+GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
+GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph are optimized to execute out-of-order
+GGML_API void   ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
+
+GGML_API void   ggml_allocr_free(struct ggml_allocr * alloc);
+GGML_API bool   ggml_allocr_is_measure(struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_reset(struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
+
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -2,34 +2,44 @@

 #include "ggml.h"

+#ifdef GGML_USE_HIPBLAS
+#define GGML_CUDA_NAME "ROCm"
+#define GGML_CUBLAS_NAME "hipBLAS"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#define GGML_CUBLAS_NAME "cuBLAS"
+#endif
+
 #ifdef  __cplusplus
 extern "C" {
 #endif

 #define GGML_CUDA_MAX_DEVICES       16

-void   ggml_init_cublas(void);
-void   ggml_cuda_set_tensor_split(const float * tensor_split);
+GGML_API void   ggml_init_cublas(void);
+GGML_API void * ggml_cuda_host_malloc(size_t size);
+GGML_API void   ggml_cuda_host_free(void * ptr);

-void   ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+GGML_API bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cuda_set_tensor_split(const float * tensor_split);
+GGML_API void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_free_data(struct ggml_tensor * tensor);

-// TODO: export these with GGML_API
-void * ggml_cuda_host_malloc(size_t size);
-void   ggml_cuda_host_free(void * ptr);
+GGML_API void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);

-void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);

-void   ggml_cuda_free_data(struct ggml_tensor * tensor);
-void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
-void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
-void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
-void   ggml_cuda_set_main_device(int main_device);
-void   ggml_cuda_set_scratch_size(size_t scratch_size);
-void   ggml_cuda_free_scratch(void);
-bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_set_main_device(int main_device);
+GGML_API void   ggml_cuda_set_mul_mat_q(bool mul_mat_q);
+GGML_API void   ggml_cuda_set_scratch_size(size_t scratch_size);
+GGML_API void   ggml_cuda_free_scratch(void);
+GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+GGML_API int    ggml_cuda_get_device_count(void);
+GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);

 #ifdef  __cplusplus
 }
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -24,6 +24,7 @@

 // max memory buffers that can be mapped to the device
 #define GGML_METAL_MAX_BUFFERS 16
+#define GGML_METAL_MAX_COMMAND_BUFFERS 32

 struct ggml_tensor;
 struct ggml_cgraph;
@@ -38,6 +39,9 @@ struct ggml_metal_context;
 struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);

+void * ggml_metal_host_malloc(size_t n);
+void   ggml_metal_host_free  (void * data);
+
 // set the number of command buffers to use
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);

@@ -63,10 +67,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *

 // try to find operations that can be run concurrently in the graph
 // you should run it again if the topology of your graph changes
-void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);

-// if the graph has been optimized for concurrently dispatch
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+
+// output the concur_list for ggml_alloc
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);

 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -5,7 +5,11 @@
 #import <Foundation/Foundation.h>

 #import <Metal/Metal.h>
-#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))

 #ifdef GGML_METAL_NDEBUG
 #define metal_printf(...)
@@ -15,6 +19,8 @@

 #define UNUSED(x) (void)(x)

+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
+
 struct ggml_metal_buffer {
    const char * name;

@@ -27,16 +33,19 @@ struct ggml_metal_buffer {
 struct ggml_metal_context {
    int n_cb;

-    float * logits;
-
    id<MTLDevice>       device;
    id<MTLCommandQueue> queue;
    id<MTLLibrary>      library;

+    id<MTLCommandBuffer>         command_buffers [GGML_METAL_MAX_COMMAND_BUFFERS];
+    id<MTLComputeCommandEncoder> command_encoders[GGML_METAL_MAX_COMMAND_BUFFERS];
+
+    dispatch_queue_t d_queue;
+
    int n_buffers;
    struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];

-    int concur_list[GGML_MAX_NODES];
+    int concur_list[GGML_MAX_CONCUR];
    int concur_list_len;

    // custom kernels
@@ -57,6 +66,7 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(get_rows_f16);
    GGML_METAL_DECL_KERNEL(get_rows_q4_0);
    GGML_METAL_DECL_KERNEL(get_rows_q4_1);
+    GGML_METAL_DECL_KERNEL(get_rows_q8_0);
    GGML_METAL_DECL_KERNEL(get_rows_q2_K);
    GGML_METAL_DECL_KERNEL(get_rows_q3_K);
    GGML_METAL_DECL_KERNEL(get_rows_q4_K);
@@ -67,11 +77,21 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q8_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
    GGML_METAL_DECL_KERNEL(rope);
    GGML_METAL_DECL_KERNEL(alibi_f32);
    GGML_METAL_DECL_KERNEL(cpy_f32_f16);
@@ -97,19 +117,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {

    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));

-    ctx->n_cb   = n_cb;
+    ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
    ctx->device = MTLCreateSystemDefaultDevice();
    ctx->queue  = [ctx->device newCommandQueue];
    ctx->n_buffers = 0;
    ctx->concur_list_len = 0;

-    // determine if we can use MPS
-    if (MPSSupportsMTLDevice(ctx->device)) {
-        fprintf(stderr, "%s: using MPS\n", __func__);
-    } else {
-        fprintf(stderr, "%s: not using MPS\n", __func__);
-        GGML_ASSERT(false && "MPS not supported");
-    }
+    ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);

 #if 0
    // compile from source string and show compile log
@@ -119,7 +133,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
        if (error) {
            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
+            return NULL;
        }
    }
 #else
@@ -137,7 +151,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        NSString * src  = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
        if (error) {
            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
+            return NULL;
        }

 #ifdef GGML_QKK_64
@@ -149,17 +163,24 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #endif
        if (error) {
            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
+            return NULL;
        }
    }
 #endif

    // load kernels
    {
+        NSError * error = nil;
 #define GGML_METAL_ADD_KERNEL(name) \
        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \
-        fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name);
+        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+        fprintf(stderr, "%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
+                (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
+                (int) ctx->pipeline_##name.threadExecutionWidth); \
+        if (error) { \
+            fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+            return NULL; \
+        }

        GGML_METAL_ADD_KERNEL(add);
        GGML_METAL_ADD_KERNEL(add_row);
@@ -174,6 +195,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(get_rows_f16);
        GGML_METAL_ADD_KERNEL(get_rows_q4_0);
        GGML_METAL_ADD_KERNEL(get_rows_q4_1);
+        GGML_METAL_ADD_KERNEL(get_rows_q8_0);
        GGML_METAL_ADD_KERNEL(get_rows_q2_K);
        GGML_METAL_ADD_KERNEL(get_rows_q3_K);
        GGML_METAL_ADD_KERNEL(get_rows_q4_K);
@@ -184,11 +206,21 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
        GGML_METAL_ADD_KERNEL(rope);
        GGML_METAL_ADD_KERNEL(alibi_f32);
        GGML_METAL_ADD_KERNEL(cpy_f32_f16);
@@ -198,12 +230,12 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #undef GGML_METAL_ADD_KERNEL
    }

-    fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
-    fprintf(stderr, "%s: hasUnifiedMemory             = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+    fprintf(stderr, "%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    fprintf(stderr, "%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
    if (ctx->device.maxTransferRate != 0) {
-        fprintf(stderr, "%s: maxTransferRate              = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+        fprintf(stderr, "%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
    } else {
-        fprintf(stderr, "%s: maxTransferRate              = built-in GPU\n", __func__);
+        fprintf(stderr, "%s: maxTransferRate               = built-in GPU\n", __func__);
    }

    return ctx;
@@ -211,21 +243,95 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {

 void ggml_metal_free(struct ggml_metal_context * ctx) {
    fprintf(stderr, "%s: deallocating\n", __func__);
+#define GGML_METAL_DEL_KERNEL(name) \
+    [ctx->function_##name release]; \
+    [ctx->pipeline_##name release];
+
+    GGML_METAL_DEL_KERNEL(add);
+    GGML_METAL_DEL_KERNEL(add_row);
+    GGML_METAL_DEL_KERNEL(mul);
+    GGML_METAL_DEL_KERNEL(mul_row);
+    GGML_METAL_DEL_KERNEL(scale);
+    GGML_METAL_DEL_KERNEL(silu);
+    GGML_METAL_DEL_KERNEL(relu);
+    GGML_METAL_DEL_KERNEL(gelu);
+    GGML_METAL_DEL_KERNEL(soft_max);
+    GGML_METAL_DEL_KERNEL(diag_mask_inf);
+    GGML_METAL_DEL_KERNEL(get_rows_f16);
+    GGML_METAL_DEL_KERNEL(get_rows_q4_0);
+    GGML_METAL_DEL_KERNEL(get_rows_q4_1);
+    GGML_METAL_DEL_KERNEL(get_rows_q8_0);
+    GGML_METAL_DEL_KERNEL(get_rows_q2_K);
+    GGML_METAL_DEL_KERNEL(get_rows_q3_K);
+    GGML_METAL_DEL_KERNEL(get_rows_q4_K);
+    GGML_METAL_DEL_KERNEL(get_rows_q5_K);
+    GGML_METAL_DEL_KERNEL(get_rows_q6_K);
+    GGML_METAL_DEL_KERNEL(rms_norm);
+    GGML_METAL_DEL_KERNEL(norm);
+    GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
+    GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
+    GGML_METAL_DEL_KERNEL(rope);
+    GGML_METAL_DEL_KERNEL(alibi_f32);
+    GGML_METAL_DEL_KERNEL(cpy_f32_f16);
+    GGML_METAL_DEL_KERNEL(cpy_f32_f32);
+    GGML_METAL_DEL_KERNEL(cpy_f16_f16);
+
+#undef GGML_METAL_DEL_KERNEL
+
    for (int i = 0; i < ctx->n_buffers; ++i) {
        [ctx->buffers[i].metal release];
    }
+
+    [ctx->library release];
+    [ctx->queue release];
+    [ctx->device release];
+
+    dispatch_release(ctx->d_queue);
+
    free(ctx);
 }

-void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
-    ctx->n_cb = n_cb;
+void * ggml_metal_host_malloc(size_t n) {
+    void * data = NULL;
+    const int result = posix_memalign((void **) &data, getpagesize(), n);
+    if (result != 0) {
+        fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
+        return NULL;
+    }
+
+    return data;
 }

-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
-    if (ctx->concur_list_len) {
-        return true;
-    }
-    return false;
+void ggml_metal_host_free(void * data) {
+    free(data);
+}
+
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
+    ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
+}
+
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
+    return ctx->concur_list_len;
+}
+
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
+    return ctx->concur_list;
 }

 // finds the Metal buffer that contains the tensor data on the GPU device
@@ -368,17 +474,17 @@ void ggml_metal_get_tensor(

 void ggml_metal_graph_find_concurrency(
        struct ggml_metal_context * ctx,
-        struct ggml_cgraph * gf) {
+        struct ggml_cgraph * gf, bool check_mem) {
    int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
-    int nodes_unused[GGML_MAX_NODES];
+    int nodes_unused[GGML_MAX_CONCUR];

-    for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;}
-    for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;}
+    for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
+    for (int i = 0; i < gf->n_nodes;     i++) { nodes_unused[i]     = 1; }
    ctx->concur_list_len = 0;

-    int n_left = gf->n_nodes;
-    int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
-    int level_pos = 0;  // at ctx->concur_list, the last layer (level) ends at level_pos
+    int n_left    = gf->n_nodes;
+    int n_start   = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
+    int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos

    while (n_left > 0) {
        // number of nodes at a layer (that can be issued concurrently)
@@ -386,28 +492,40 @@ void ggml_metal_graph_find_concurrency(
        for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
            if (nodes_unused[i]) {
                // if the requirements for gf->nodes[i] are satisfied
-                int exe_flag=1;
+                int exe_flag = 1;
+
                // scan all srcs
                for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
                    struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
                    if (src_cur) {
                        // if is leaf nodes it's satisfied.
-                        if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}
+                        // TODO: ggml_is_leaf()
+                        if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
+                            continue;
+                        }

                        // otherwise this src should be the output from previous nodes.
                        int is_found = 0;
+
                        // scan 2*search_depth back because we inserted barrier.
-                        for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
-                            if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;}
+                        //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
+                        for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
+                            if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
+                                is_found = 1;
+                                break;
+                            }
+                        }
+                        if (is_found == 0) {
+                            exe_flag = 0;
+                            break;
                        }
-                        if (is_found == 0) {exe_flag = 0; break;}
                    }
                }
-                if (exe_flag) {
+                if (exe_flag && check_mem) {
                    // check if nodes[i]'s data will be overwritten by a node before nodes[i].
                    // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
                    int64_t data_start = (int64_t) gf->nodes[i]->data;
-                    int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]);
+                    int64_t length     = (int64_t) ggml_nbytes(gf->nodes[i]);
                    for (int j = n_start; j < i; j++) {
                        if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
                                            && gf->nodes[j]->op != GGML_OP_VIEW \
@@ -416,9 +534,9 @@ void ggml_metal_graph_find_concurrency(
                            if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
                                ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
                                continue;
-                            } else {
-                                exe_flag = 0;
                            }
+
+                            exe_flag = 0;
                        }
                    }
                }
@@ -435,11 +553,13 @@ void ggml_metal_graph_find_concurrency(
        ctx->concur_list[level_pos + concurrency] = -1;
        ctx->concur_list_len++;
        // jump all sorted nodes at nodes_bak
-        while (!nodes_unused[n_start]) {n_start++;}
+        while (!nodes_unused[n_start]) {
+            n_start++;
+        }
        level_pos += concurrency + 1;
    }

-    if (ctx->concur_list_len > GGML_MAX_NODES) {
+    if (ctx->concur_list_len > GGML_MAX_CONCUR) {
        fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
    }
 }
@@ -449,11 +569,13 @@ void ggml_metal_graph_compute(
               struct ggml_cgraph * gf) {
    metal_printf("%s: evaluating graph\n", __func__);

+    @autoreleasepool {
+
    // if there is ctx->concur_list, dispatch concurrently
    // else fallback to serial dispatch
    MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;

-    const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES;
+    const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;

    const int n_nodes  = has_concur ? ctx->concur_list_len      : gf->n_nodes;
    edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
@@ -463,41 +585,33 @@ void ggml_metal_graph_compute(

    const int n_cb = ctx->n_cb;

-    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
-
    for (int i = 0; i < n_cb; ++i) {
-        command_buffers[i] = [ctx->queue commandBuffer];
+        ctx->command_buffers[i] = [ctx->queue commandBuffer];

        // enqueue the command buffers in order to specify their execution order
-        [command_buffers[i] enqueue];
-    }
+        [ctx->command_buffers[i] enqueue];

-    // TODO: is this the best way to start threads?
-    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
+        ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
+    }

    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
        const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;

-        dispatch_async(queue, ^{
+        dispatch_async(ctx->d_queue, ^{
            size_t offs_src0 = 0;
            size_t offs_src1 = 0;
            size_t offs_dst  = 0;

-            id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
+            id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
+            id<MTLComputeCommandEncoder> encoder = ctx->command_encoders[cb_idx];

-            id<MTLComputeCommandEncoder> encoder = nil;
-
-            const int node_start =                                  (cb_idx + 0) * n_nodes_per_cb;
-            const int node_end   = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+            const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
+            const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);

            for (int ind = node_start; ind < node_end; ++ind) {
                const int i = has_concur ? ctx->concur_list[ind] : ind;

                if (i == -1) {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                        continue;
-                    }
                    [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
                    continue;
                }
@@ -571,10 +685,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_ADD:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            if (ggml_nelements(src1) == ne10) {
                                // src1 is a row
                                [encoder setComputePipelineState:ctx->pipeline_add_row];
@@ -592,10 +702,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_MUL:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            if (ggml_nelements(src1) == ne10) {
                                // src1 is a row
                                [encoder setComputePipelineState:ctx->pipeline_mul_row];
@@ -613,10 +719,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_SCALE:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            const float scale = *(const float *) src1->data;

                            [encoder setComputePipelineState:ctx->pipeline_scale];
@@ -632,10 +734,6 @@ void ggml_metal_graph_compute(
                        switch (ggml_get_unary_op(gf->nodes[i])) {
                            case GGML_UNARY_OP_SILU:
                                {
-                                    if (encoder == nil) {
-                                        encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                                    }
-
                                    [encoder setComputePipelineState:ctx->pipeline_silu];
                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -646,10 +744,6 @@ void ggml_metal_graph_compute(
                                } break;
                            case GGML_UNARY_OP_RELU:
                                {
-                                    if (encoder == nil) {
-                                        encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                                    }
-
                                    [encoder setComputePipelineState:ctx->pipeline_relu];
                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -660,10 +754,6 @@ void ggml_metal_graph_compute(
                                } break;
                            case GGML_UNARY_OP_GELU:
                                {
-                                    if (encoder == nil) {
-                                        encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                                    }
-
                                    [encoder setComputePipelineState:ctx->pipeline_gelu];
                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -680,10 +770,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_SOFT_MAX:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            const int nth = 32;

                            [encoder setComputePipelineState:ctx->pipeline_soft_max];
@@ -698,10 +784,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_DIAG_MASK_INF:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            const int n_past = ((int32_t *)(dst->op_params))[0];

                            [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
@@ -718,53 +800,44 @@ void ggml_metal_graph_compute(
                            // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224

                            GGML_ASSERT(ne00 == ne10);
-                            GGML_ASSERT(ne02 == ne12);
+                            // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+                            uint gqa = ne12/ne02;
+                            GGML_ASSERT(ne03 == ne13);

+                            // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+                            // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
                            if (ggml_is_contiguous(src0) &&
                                ggml_is_contiguous(src1) &&
-                                (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) {
-
-                                if (encoder != nil) {
-                                    [encoder endEncoding];
-                                    encoder = nil;
-                                }
-
-                                MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
-                                MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
-
-                                // for F32 x F32 we use MPS
-                                MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
-                                    matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt];
-
-                                MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
-                                    matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt];
-
-                                MPSMatrixDescriptor * desc  = [MPSMatrixDescriptor
-                                    matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32];
-
-                                MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
-                                    initWithDevice:ctx->device transposeLeft:false transposeRight:true
-                                        resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
-
-                                // we need to do ne02 multiplications
-                                // TODO: is there a way to do this in parallel - currently very slow ..
-                                // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-                                for (int64_t i02 = 0; i02 < ne02; ++i02) {
-                                    size_t offs_src0_cur = offs_src0 + i02*nb02;
-                                    size_t offs_src1_cur = offs_src1 + i02*nb12;
-                                    size_t offs_dst_cur  = offs_dst  + i02*nb2;
-
-                                    MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0];
-                                    MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1];
-                                    MPSMatrix * mat_dst  = [[MPSMatrix alloc] initWithBuffer:id_dst  offset:offs_dst_cur  descriptor:desc ];
-
-                                    [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
+                                src1t == GGML_TYPE_F32 &&
+                                [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                                ne00%32 == 0 &&
+                                ne11 > 1) {
+                                switch (src0->type) {
+                                    case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32];  break;
+                                    case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
+                                    case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
+                                    case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q8_0_f32]; break;
+                                    case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
+                                    case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
+                                    case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
+                                    case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
+                                    case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
+                                    default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
                                }
+                                [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
+                                [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
+                                [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
+                                [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:3];
+                                [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:4];
+                                [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:5];
+                                [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:6];
+                                [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:7];
+                                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:8];
+                                [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:9];
+                                [encoder setBytes:&gqa     length:sizeof(gqa)  atIndex:10];
+                                [encoder setThreadgroupMemoryLength:8192 atIndex:0];
+                                [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                            } else {
-                                if (encoder == nil) {
-                                    encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                                }
-
                                int nth0 = 32;
                                int nth1 = 1;

@@ -772,8 +845,6 @@ void ggml_metal_graph_compute(
                                switch (src0t) {
                                    case GGML_TYPE_F16:
                                        {
-                                            GGML_ASSERT(ne02 == ne12);
-
                                            nth0 = 64;
                                            nth1 = 1;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -796,6 +867,15 @@ void ggml_metal_graph_compute(
                                            nth1 = 8;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
                                        } break;
+                                    case GGML_TYPE_Q8_0:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);
+
+                                            nth0 = 8;
+                                            nth1 = 8;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q8_0_f32];
+                                        } break;
                                    case GGML_TYPE_Q2_K:
                                        {
                                            GGML_ASSERT(ne02 == 1);
@@ -853,33 +933,36 @@ void ggml_metal_graph_compute(
                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
                                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
                                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
-                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
-                                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
-                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
-                                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
-                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
-                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
-                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
-                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
-                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];
+                                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+                                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+                                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
+                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
+                                [encoder setBytes:&gqa  length:sizeof(gqa)  atIndex:17];

-                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
                                    src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                                else if (src0t == GGML_TYPE_Q3_K) {
 #ifdef GGML_QKK_64
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 #else
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 #endif
                                }
                                else if (src0t == GGML_TYPE_Q5_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                                else if (src0t == GGML_TYPE_Q6_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                } else {
                                    [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -888,14 +971,11 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_GET_ROWS:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            switch (src0->type) {
-                                case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
+                                case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16];  break;
                                case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                                case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
+                                case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q8_0]; break;
                                case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
                                case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
                                case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
@@ -917,10 +997,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_RMS_NORM:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            float eps;
                            memcpy(&eps, dst->op_params, sizeof(float));

@@ -940,20 +1016,17 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_NORM:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
-                            const float eps = 1e-5f;
+                            float eps;
+                            memcpy(&eps, dst->op_params, sizeof(float));

                            const int nth = 256;

                            [encoder setComputePipelineState:ctx->pipeline_norm];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
-                            [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
+                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
+                            [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
                            [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];

                            const int64_t nrows = ggml_nrows(src0);
@@ -962,10 +1035,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_ALIBI:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            GGML_ASSERT((src0t == GGML_TYPE_F32));

                            const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
@@ -1000,15 +1069,13 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
                            [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
                            [encoder setBytes:&m0  length:sizeof(    float) atIndex:18];
+
                            const int nth = 32;
+
                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    case GGML_OP_ROPE:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            const int n_past = ((int32_t *) dst->op_params)[0];
                            const int n_dims = ((int32_t *) dst->op_params)[1];
                            const int mode   = ((int32_t *) dst->op_params)[2];
@@ -1019,8 +1086,8 @@ void ggml_metal_graph_compute(
                            memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

                            [encoder setComputePipelineState:ctx->pipeline_rope];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
@@ -1049,10 +1116,6 @@ void ggml_metal_graph_compute(
                    case GGML_OP_CPY:
                    case GGML_OP_CONT:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            const int nth = 32;

                            switch (src0t) {
@@ -1075,24 +1138,24 @@ void ggml_metal_graph_compute(
                                default: GGML_ASSERT(false && "not implemented");
                            }

-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
-                            [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
-                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
+                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
+                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
+                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
+                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
+                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
+                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
+                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
+                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
+                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
+                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
+                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
+                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
+                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
+                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];

                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
@@ -1114,17 +1177,19 @@ void ggml_metal_graph_compute(
    }

    // wait for all threads to finish
-    dispatch_barrier_sync(queue, ^{});
-
-    [command_buffers[n_cb - 1] waitUntilCompleted];
+    dispatch_barrier_sync(ctx->d_queue, ^{});

    // check status of command buffers
    // needed to detect if the device ran out-of-memory for example (#1881)
    for (int i = 0; i < n_cb; i++) {
-        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
+        [ctx->command_buffers[i] waitUntilCompleted];
+
+        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
        if (status != MTLCommandBufferStatusCompleted) {
            fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
            GGML_ASSERT(false);
        }
    }
+
+    }
 }
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@@ -183,6 +183,15 @@
 #    define GGML_API
 #endif

+// TODO: support for clang
+#ifdef __GNUC__
+#    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define GGML_DEPRECATED(func, hint) func
+#endif
+
 #include <stdint.h>
 #include <stddef.h>
 #include <stdbool.h>
@@ -198,7 +207,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
-#define GGML_MAX_NAME          48
+#define GGML_MAX_NAME          64
 #define GGML_MAX_OP_PARAMS     32
 #define GGML_DEFAULT_N_THREADS 4

@@ -206,6 +215,11 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1

+#define GGUF_MAGIC   0x46554747 // "GGUF"
+#define GGUF_VERSION 2
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
 #define GGML_UNUSED(x) (void)(x)

 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -246,8 +260,9 @@
 extern "C" {
 #endif

-#ifdef __ARM_NEON
-    // we use the built-in 16-bit float type
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+    typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON)
    typedef __fp16 ggml_fp16_t;
 #else
    typedef uint16_t ggml_fp16_t;
@@ -331,10 +346,12 @@ extern "C" {
        GGML_OP_ARGMAX,
        GGML_OP_REPEAT,
        GGML_OP_REPEAT_BACK,
+        GGML_OP_CONCAT,
        GGML_OP_SILU_BACK,
        GGML_OP_NORM, // normalize
        GGML_OP_RMS_NORM,
        GGML_OP_RMS_NORM_BACK,
+        GGML_OP_GROUP_NORM,

        GGML_OP_MUL_MAT,
        GGML_OP_OUT_PROD,
@@ -360,20 +377,29 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_1D,
        GGML_OP_CONV_2D,
+        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,

+        GGML_OP_UPSCALE, // nearest interpolate
+
        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
        GGML_OP_FLASH_ATTN_BACK,
        GGML_OP_WIN_PART,
        GGML_OP_WIN_UNPART,
+        GGML_OP_GET_REL_POS,
+        GGML_OP_ADD_REL_POS,

        GGML_OP_UNARY,

        GGML_OP_MAP_UNARY,
        GGML_OP_MAP_BINARY,

+        GGML_OP_MAP_CUSTOM1_F32,
+        GGML_OP_MAP_CUSTOM2_F32,
+        GGML_OP_MAP_CUSTOM3_F32,
+
        GGML_OP_MAP_CUSTOM1,
        GGML_OP_MAP_CUSTOM2,
        GGML_OP_MAP_CUSTOM3,
@@ -549,6 +575,7 @@ extern "C" {
    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

    GGML_API int     ggml_blck_size (enum ggml_type type);
@@ -570,6 +597,8 @@ extern "C" {
    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);

+    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+
    // use this to compute the memory overhead of a tensor
    GGML_API size_t ggml_tensor_overhead(void);

@@ -784,6 +813,13 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // concat a and b on dim 2
+    // used in stable-diffusion
+    GGML_API struct ggml_tensor * ggml_concat(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
    GGML_API struct ggml_tensor * ggml_abs(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
@@ -873,14 +909,15 @@ extern "C" {
            struct ggml_tensor  * b);

    // normalize along rows
-    // TODO: eps is hardcoded to 1e-5 for now
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);

    GGML_API struct ggml_tensor * ggml_norm_inplace(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);

    GGML_API struct ggml_tensor * ggml_rms_norm(
            struct ggml_context * ctx,
@@ -892,6 +929,19 @@ extern "C" {
            struct ggml_tensor  * a,
            float                 eps);

+    // group normalize along ne0*ne1*n_groups
+    // used in stable-diffusion
+    // TODO: eps is hardcoded to 1e-6 for now
+    GGML_API struct ggml_tensor * ggml_group_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_groups);
+
+    GGML_API struct ggml_tensor * ggml_group_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_groups);
+
    // a - x
    // b - dy
    // TODO: update with configurable eps
@@ -1170,7 +1220,18 @@ extern "C" {
            int                   mode,
            int                   n_ctx);

-    // custom RoPE, in-place, returns view(a)
+    // custom RoPE
+    GGML_API struct ggml_tensor * ggml_rope_custom(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale);
+
+    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -1181,6 +1242,15 @@ extern "C" {
            float                 freq_base,
            float                 freq_scale);

+    // xPos RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            float                 base,
+            bool                  down);
+
    // rotary position embedding backward, i.e compute dx from dy
    // a - dy
    GGML_API struct ggml_tensor * ggml_rope_back(
@@ -1189,7 +1259,11 @@ extern "C" {
            int                   n_past,
            int                   n_dims,
            int                   mode,
-            int                   n_ctx);
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 xpos_base,
+            bool                  xpos_down);

    // alibi position embedding
    // in-place, returns view(a)
@@ -1216,6 +1290,15 @@ extern "C" {
            int                   p0,  // padding
            int                   d0); // dilation

+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   s,
+            int                   d);
+
    GGML_API struct ggml_tensor * ggml_conv_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -1227,14 +1310,38 @@ extern "C" {
            int                   d0,
            int                   d1);

-    // conv_1d with padding = half
-    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
-    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is equal to kernel size
+    // padding is zero
+    // example:
+    // a:     16   16    3  768
+    // b:   1024 1024    3    1
+    // res:   64   64  768    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is 1
+    // padding is half
+    // example:
+    // a:      3    3    256  256
+    // b:     64   64    256    1
+    // res:   64   64    256    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
-            int                   s,
-            int                   d);
+            int                   stride);

    enum ggml_op_pool {
        GGML_OP_POOL_MAX,
@@ -1242,7 +1349,7 @@ extern "C" {
        GGML_OP_POOL_COUNT,
    };

-    GGML_API struct ggml_tensor* ggml_pool_1d(
+    GGML_API struct ggml_tensor * ggml_pool_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            enum ggml_op_pool     op,
@@ -1250,7 +1357,7 @@ extern "C" {
            int                   s0, // stride
            int                   p0); // padding

-    GGML_API struct ggml_tensor* ggml_pool_2d(
+    GGML_API struct ggml_tensor * ggml_pool_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            enum ggml_op_pool     op,
@@ -1261,6 +1368,13 @@ extern "C" {
            int                   p0,
            int                   p1);

+    // nearest interpolate
+    // used in stable-diffusion
+    GGML_API struct ggml_tensor * ggml_upscale(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   scale_factor);
+
    GGML_API struct ggml_tensor * ggml_flash_attn(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
@@ -1304,15 +1418,6 @@ extern "C" {
            int                   h0,
            int                   w);

-    // custom operators
-
-    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
-    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-
    GGML_API struct ggml_tensor * ggml_unary(
            struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -1323,63 +1428,159 @@ extern "C" {
        struct ggml_tensor  * a,
        enum ggml_unary_op op);

-    GGML_API struct ggml_tensor * ggml_map_unary_f32(
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_get_rel_pos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   qh,
+            int                   kh);
+
+    // used in sam
+
+    GGML_API struct ggml_tensor * ggml_add_rel_pos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pw,
+            struct ggml_tensor  * ph);
+
+    GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pw,
+            struct ggml_tensor  * ph);
+
+    // custom operators
+
+    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
            struct ggml_context        * ctx,
            struct ggml_tensor         * a,
-                   ggml_unary_op_f32_t   fun);
+                   ggml_unary_op_f32_t   fun),
+        "use ggml_map_custom1 instead");

-    GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
            struct ggml_context        * ctx,
            struct ggml_tensor         * a,
-                   ggml_unary_op_f32_t   fun);
+                   ggml_unary_op_f32_t   fun),
+        "use ggml_map_custom1_inplace instead");

-    GGML_API struct ggml_tensor * ggml_map_binary_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
            struct ggml_context         * ctx,
            struct ggml_tensor          * a,
            struct ggml_tensor          * b,
-                   ggml_binary_op_f32_t   fun);
+                   ggml_binary_op_f32_t   fun),
+        "use ggml_map_custom2 instead");

-    GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
            struct ggml_context         * ctx,
            struct ggml_tensor          * a,
            struct ggml_tensor          * b,
-                   ggml_binary_op_f32_t   fun);
+                   ggml_binary_op_f32_t   fun),
+        "use ggml_map_custom2_inplace instead");

-    GGML_API struct ggml_tensor * ggml_map_custom1_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
            struct ggml_context          * ctx,
            struct ggml_tensor           * a,
-                   ggml_custom1_op_f32_t   fun);
+                   ggml_custom1_op_f32_t   fun),
+        "use ggml_map_custom1 instead");

-    GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
            struct ggml_context          * ctx,
            struct ggml_tensor           * a,
-                   ggml_custom1_op_f32_t   fun);
+                   ggml_custom1_op_f32_t   fun),
+        "use ggml_map_custom1_inplace instead");

-    GGML_API struct ggml_tensor * ggml_map_custom2_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
            struct ggml_context          * ctx,
            struct ggml_tensor           * a,
            struct ggml_tensor           * b,
-                   ggml_custom2_op_f32_t   fun);
+                   ggml_custom2_op_f32_t   fun),
+        "use ggml_map_custom2 instead");

-    GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
            struct ggml_context          * ctx,
            struct ggml_tensor           * a,
            struct ggml_tensor           * b,
-                   ggml_custom2_op_f32_t   fun);
+                   ggml_custom2_op_f32_t   fun),
+        "use ggml_map_custom2_inplace instead");

-    GGML_API struct ggml_tensor * ggml_map_custom3_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
            struct ggml_context          * ctx,
            struct ggml_tensor           * a,
            struct ggml_tensor           * b,
            struct ggml_tensor           * c,
-                   ggml_custom3_op_f32_t   fun);
+                   ggml_custom3_op_f32_t   fun),
+        "use ggml_map_custom3 instead");

-    GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
            struct ggml_context          * ctx,
            struct ggml_tensor           * a,
            struct ggml_tensor           * b,
            struct ggml_tensor           * c,
-                   ggml_custom3_op_f32_t   fun);
+                   ggml_custom3_op_f32_t   fun),
+        "use ggml_map_custom3_inplace instead");
+
+    // custom operators v2
+
+    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
+
+    #define GGML_N_TASKS_MAX -1
+
+    GGML_API struct ggml_tensor * ggml_map_custom1(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            ggml_custom1_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            ggml_custom1_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            ggml_custom2_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            ggml_custom2_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            struct ggml_tensor    * c,
+            ggml_custom3_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            struct ggml_tensor    * c,
+            ggml_custom3_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);

    // loss function

@@ -1611,6 +1812,127 @@ extern "C" {

    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

+    //
+    // gguf
+    //
+
+    enum gguf_type {
+        GGUF_TYPE_UINT8   = 0,
+        GGUF_TYPE_INT8    = 1,
+        GGUF_TYPE_UINT16  = 2,
+        GGUF_TYPE_INT16   = 3,
+        GGUF_TYPE_UINT32  = 4,
+        GGUF_TYPE_INT32   = 5,
+        GGUF_TYPE_FLOAT32 = 6,
+        GGUF_TYPE_BOOL    = 7,
+        GGUF_TYPE_STRING  = 8,
+        GGUF_TYPE_ARRAY   = 9,
+        GGUF_TYPE_UINT64  = 10,
+        GGUF_TYPE_INT64   = 11,
+        GGUF_TYPE_FLOAT64 = 12,
+        GGUF_TYPE_COUNT,       // marks the end of the enum
+    };
+
+    struct gguf_context;
+
+    struct gguf_init_params {
+        bool no_alloc;
+
+        // if not NULL, create a ggml_context and allocate the tensor data in it
+        struct ggml_context ** ctx;
+    };
+
+    GGML_API struct gguf_context * gguf_init_empty(void);
+    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+    GGML_API void gguf_free(struct gguf_context * ctx);
+
+    GGML_API const char * gguf_type_name(enum gguf_type type);
+
+    GGML_API int    gguf_get_version    (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
+    GGML_API void * gguf_get_data       (struct gguf_context * ctx);
+
+    GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
+    GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
+    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
+
+    GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
+    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
+
+    // results are undefined if the wrong type is used for the key
+    GGML_API uint8_t      gguf_get_val_u8  (struct gguf_context * ctx, int i);
+    GGML_API int8_t       gguf_get_val_i8  (struct gguf_context * ctx, int i);
+    GGML_API uint16_t     gguf_get_val_u16 (struct gguf_context * ctx, int i);
+    GGML_API int16_t      gguf_get_val_i16 (struct gguf_context * ctx, int i);
+    GGML_API uint32_t     gguf_get_val_u32 (struct gguf_context * ctx, int i);
+    GGML_API int32_t      gguf_get_val_i32 (struct gguf_context * ctx, int i);
+    GGML_API float        gguf_get_val_f32 (struct gguf_context * ctx, int i);
+    GGML_API uint64_t     gguf_get_val_u64 (struct gguf_context * ctx, int i);
+    GGML_API int64_t      gguf_get_val_i64 (struct gguf_context * ctx, int i);
+    GGML_API double       gguf_get_val_f64 (struct gguf_context * ctx, int i);
+    GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
+    GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
+    GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
+
+    GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
+    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
+    GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
+    GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
+
+    // overrides existing values or adds a new one
+    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
+    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
+    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
+    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
+    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
+    GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
+    GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t  val);
+    GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double   val);
+    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
+    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
+    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
+
+    // set or add KV pairs from another context
+    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+
+    // manage tensor info
+    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+
+    // writing gguf files can be done in 2 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(f, ...);
+    //   void * data = gguf_meta_get_meta_data(ctx);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(f, data, gguf_get_meta_size(ctx));
+    //   free(data);
+    //   fclose(f);
+    //
+
+    // write the entire context to a binary file
+    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+    GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);
+
    //
    // system info
    //
@@ -1631,6 +1953,7 @@ extern "C" {
    GGML_API int ggml_cpu_has_clblast    (void);
    GGML_API int ggml_cpu_has_gpublas    (void);
    GGML_API int ggml_cpu_has_sse3       (void);
+    GGML_API int ggml_cpu_has_ssse3      (void);
    GGML_API int ggml_cpu_has_vsx        (void);

    //
@@ -1648,6 +1971,10 @@ extern "C" {
    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);

    typedef struct {
+        const char      * type_name;
+        int               blck_size;
+        size_t            type_size;
+        bool              is_quantized;
        ggml_to_float_t   to_float;
        ggml_from_float_t from_float;
        ggml_from_float_t from_float_reference;
@@ -1655,7 +1982,7 @@ extern "C" {
        enum ggml_type    vec_dot_type;
    } ggml_type_traits_t;

-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

 #ifdef  __cplusplus
 }
--- a/gguf-py/LICENSE
+++ b/gguf-py/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Georgi Gerganov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/gguf-py/README.md
+++ b/gguf-py/README.md
@@ -0,0 +1,55 @@
+## gguf
+
+This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
+(GGML Universal File) format.
+
+See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py)
+as an example for its usage.
+
+## Installation
+```sh
+pip install gguf
+```
+
+## Development
+Maintainers who participate in development of this package are advised to install it in editable mode:
+
+```sh
+cd /path/to/llama.cpp/gguf-py
+
+pip install --editable .
+```
+
+**Note**: This may require to upgrade your Pip installation, with a message saying that editable installation currently requires `setup.py`.
+In this case, upgrade Pip to the latest:
+
+```sh
+pip install --upgrade pip
+```
+
+## Publishing
+To publish the package, you need to have `twine` and `build` installed:
+
+```sh
+pip install build twine
+```
+
+Then, folow these steps to release a new version:
+
+1. Update the version in `pyproject.toml`.
+2. Build the package:
+
+```sh
+python -m build
+```
+
+3. Upload the generated distribution archives:
+
+```sh
+python -m twine upload dist/*
+```
+
+## TODO
+- [ ] Add tests
+- [ ] Include conversion scripts as command line entry points in this package.
+- Add CI workflow for releasing the package.
--- a/gguf-py/gguf/init.py
+++ b/gguf-py/gguf/init.py
@@ -0,0 +1 @@
+from .gguf import *
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -0,0 +1,749 @@
+#!/usr/bin/env python3
+import shutil
+import sys
+import struct
+import tempfile
+import numpy as np
+
+from enum import IntEnum, auto
+from typing import Any, IO, List, Optional
+
+#
+# constants
+#
+
+GGUF_MAGIC             = 0x46554747
+GGUF_VERSION           = 2
+GGUF_DEFAULT_ALIGNMENT = 32
+
+# general
+KEY_GENERAL_ARCHITECTURE         = "general.architecture"
+KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
+KEY_GENERAL_ALIGNMENT            = "general.alignment"
+KEY_GENERAL_NAME                 = "general.name"
+KEY_GENERAL_AUTHOR               = "general.author"
+KEY_GENERAL_URL                  = "general.url"
+KEY_GENERAL_DESCRIPTION          = "general.description"
+KEY_GENERAL_LICENSE              = "general.license"
+KEY_GENERAL_SOURCE_URL           = "general.source.url"
+KEY_GENERAL_SOURCE_HF_REPO       = "general.source.hugginface.repository"
+KEY_GENERAL_FILE_TYPE            = "general.file_type"
+
+# LLM
+KEY_CONTEXT_LENGTH        = "{arch}.context_length"
+KEY_EMBEDDING_LENGTH      = "{arch}.embedding_length"
+KEY_BLOCK_COUNT           = "{arch}.block_count"
+KEY_FEED_FORWARD_LENGTH   = "{arch}.feed_forward_length"
+KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
+KEY_TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"
+
+# attention
+KEY_ATTENTION_HEAD_COUNT        = "{arch}.attention.head_count"
+KEY_ATTENTION_HEAD_COUNT_KV     = "{arch}.attention.head_count_kv"
+KEY_ATTENTION_MAX_ALIBI_BIAS    = "{arch}.attention.max_alibi_bias"
+KEY_ATTENTION_CLAMP_KQV         = "{arch}.attention.clamp_kqv"
+KEY_ATTENTION_LAYERNORM_EPS     = "{arch}.attention.layer_norm_epsilon"
+KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
+
+# RoPE
+KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
+KEY_ROPE_FREQ_BASE       = "{arch}.rope.freq_base"
+KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"
+
+# tokenization
+KEY_TOKENIZER_MODEL      = "tokenizer.ggml.model"
+KEY_TOKENIZER_LIST       = "tokenizer.ggml.tokens"
+KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
+KEY_TOKENIZER_SCORES     = "tokenizer.ggml.scores"
+KEY_TOKENIZER_MERGES     = "tokenizer.ggml.merges"
+KEY_TOKENIZER_BOS_ID     = "tokenizer.ggml.bos_token_id"
+KEY_TOKENIZER_EOS_ID     = "tokenizer.ggml.eos_token_id"
+KEY_TOKENIZER_UNK_ID     = "tokenizer.ggml.unknown_token_id"
+KEY_TOKENIZER_SEP_ID     = "tokenizer.ggml.seperator_token_id"
+KEY_TOKENIZER_PAD_ID     = "tokenizer.ggml.padding_token_id"
+KEY_TOKENIZER_HF_JSON    = "tokenizer.huggingface.json"
+KEY_TOKENIZER_RWKV       = "tokenizer.rwkv.world"
+
+
+#
+# recommended mapping of model tensor names for storage in gguf
+#
+
+
+class MODEL_ARCH(IntEnum):
+    LLAMA   = auto()
+    FALCON  = auto()
+    GPT2    = auto()
+    GPTJ    = auto()
+    GPTNEOX = auto()
+    MPT     = auto()
+
+
+class MODEL_TENSOR(IntEnum):
+    TOKEN_EMBD    = auto()
+    POS_EMBD      = auto()
+    OUTPUT        = auto()
+    OUTPUT_NORM   = auto()
+    ROPE_FREQS    = auto()
+    ATTN_Q        = auto()
+    ATTN_K        = auto()
+    ATTN_V        = auto()
+    ATTN_QKV      = auto()
+    ATTN_OUT      = auto()
+    ATTN_NORM     = auto()
+    ATTN_NORM_2   = auto()
+    ATTN_ROT_EMBD = auto()
+    FFN_GATE      = auto()
+    FFN_DOWN      = auto()
+    FFN_UP        = auto()
+    FFN_NORM      = auto()
+
+
+MODEL_ARCH_NAMES = {
+    MODEL_ARCH.LLAMA:   "llama",
+    MODEL_ARCH.FALCON:  "falcon",
+    MODEL_ARCH.GPT2:    "gpt2",
+    MODEL_ARCH.GPTJ:    "gptj",
+    MODEL_ARCH.GPTNEOX: "gptneox",
+    MODEL_ARCH.MPT:     "mpt",
+}
+
+MODEL_TENSOR_NAMES = {
+    MODEL_ARCH.LLAMA: {
+        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.ROPE_FREQS:    "rope_freqs",
+        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_Q:        "blk.{bid}.attn_q",
+        MODEL_TENSOR.ATTN_K:        "blk.{bid}.attn_k",
+        MODEL_TENSOR.ATTN_V:        "blk.{bid}.attn_v",
+        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_GATE:      "blk.{bid}.ffn_gate",
+        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.GPTNEOX: {
+        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_QKV:      "blk.{bid}.attn_qkv",
+        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.FALCON: {
+        MODEL_TENSOR.TOKEN_EMBD:  "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
+        MODEL_TENSOR.OUTPUT:      "output",
+        MODEL_TENSOR.ATTN_NORM:   "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
+        MODEL_TENSOR.ATTN_QKV:    "blk.{bid}.attn_qkv",
+        MODEL_TENSOR.ATTN_OUT:    "blk.{bid}.attn_output",
+        MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.GPT2: {
+        # TODO
+    },
+    # TODO
+}
+
+# tensors that will not be serialized
+MODEL_TENSOR_SKIP = {
+    MODEL_ARCH.LLAMA: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+}
+
+
+# TODO: the following helper functions should be removed
+#       instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
+#       however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
+# REMOVE
+def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
+    for skip in MODEL_TENSOR_SKIP.get(arch, []):
+        for i in range(n_blocks):
+            if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
+                return True
+
+    return False
+
+
+def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
+    tensor_map = {}
+
+    # Token embeddings
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)
+
+    tensor_map["gpt_neox.embed_in"]           = mapped_to  # gptneox
+    tensor_map["transformer.wte"]             = mapped_to  # gpt2 mpt
+    tensor_map["transformer.word_embeddings"] = mapped_to  # falcon
+    tensor_map["model.embed_tokens"]          = mapped_to  # llama-hf
+    tensor_map["tok_embeddings"]              = mapped_to  # llama-pth
+
+    # Position embeddings
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)
+
+    tensor_map["transformer.wpe"] = mapped_to  # gpt2
+
+    # Output
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)
+
+    tensor_map["embed_out"] = mapped_to  # gptneox
+    tensor_map["lm_head"]   = mapped_to  # gpt2 mpt falcon llama-hf
+    tensor_map["output"]    = mapped_to  # llama-pth
+
+    # Output norm
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)
+
+    tensor_map["gpt_neox.final_layer_norm"] = mapped_to  # gptneox
+    tensor_map["transformer.ln_f"]          = mapped_to  # gpt2 falcon
+    tensor_map["transformer.norm_f"]        = mapped_to  # mpt
+    tensor_map["model.norm"]                = mapped_to  # llama-hf
+    tensor_map["norm"]                      = mapped_to  # llama-pth
+
+    # Rope frequencies
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)
+
+    tensor_map["rope.freqs"] = mapped_to  # llama-pth
+
+    # Attention and feed-forward blocks
+    for i in range(0, n_blocks):
+        # Attention norm
+        # TODO: is there are simpler way to write these 2 lines in Python?
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".ln_1"]              = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".norm_1"]       = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".input_layernorm"]   = mapped_to  # falcon7b
+        tensor_map["transformer.h."+str(i)+".ln_mlp"]            = mapped_to  # falcon40b
+        tensor_map["model.layers."+str(i)+".input_layernorm"]    = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention_norm"]           = mapped_to  # llama-pth
+
+        # Attention norm 2
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to  # falcon40b
+
+        # Attention query-key-value
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"]    = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".attn.c_attn"]                    = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"]                 = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to  # falcon
+
+        # Attention query
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wq"]           = mapped_to  # llama-pth
+
+        # Attention key
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wk"]           = mapped_to  # llama-pth
+
+        # Attention value
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wv"]           = mapped_to  # llama-pth
+
+        # Attention output
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".attention.dense"]    = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".attn.c_proj"]          = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".attn.out_proj"]   = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".self_attn.o_proj"]      = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wo"]                = mapped_to  # llama-pth
+
+        # Rotary embeddings
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"]  = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to  # llama-pth
+
+        # Feed-forward norm
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".ln_2"]                       = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".norm_2"]                = mapped_to  # mpt
+        tensor_map["model.layers."+str(i)+".post_attention_layernorm"]    = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".ffn_norm"]                          = mapped_to  # llama-pth
+
+        # Feed-forward up
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".mlp.c_fc"]            = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"]    = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"]   = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".mlp.up_proj"]          = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w3"]            = mapped_to  # llama-pth
+
+        # Feed-forward gate
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w1"]     = mapped_to  # llama-pth
+
+        # Feed-forward down
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".mlp.c_proj"]          = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"]  = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"]   = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".mlp.down_proj"]        = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w2"]            = mapped_to  # llama-pth
+
+    return tensor_map
+
+
+class TokenType(IntEnum):
+    NORMAL       = 1
+    UNKNOWN      = 2
+    CONTROL      = 3
+    USER_DEFINED = 4
+    UNUSED       = 5
+    BYTE         = 6
+
+#
+# implementation
+#
+
+
+class GGMLQuantizationType(IntEnum):
+    F32  = 0
+    F16  = 1
+    Q4_0 = 2
+    Q4_1 = 3
+    Q5_0 = 6
+    Q5_1 = 7
+    Q8_0 = 8
+    Q8_1 = 9
+    Q2_K = 10
+    Q3_K = 11
+    Q4_K = 12
+    Q5_K = 13
+    Q6_K = 14
+    Q8_K = 15
+
+
+class GGUFValueType(IntEnum):
+    UINT8   = 0
+    INT8    = 1
+    UINT16  = 2
+    INT16   = 3
+    UINT32  = 4
+    INT32   = 5
+    FLOAT32 = 6
+    BOOL    = 7
+    STRING  = 8
+    ARRAY   = 9
+    UINT64  = 10
+    INT64   = 11
+    FLOAT64 = 12
+
+    @staticmethod
+    def get_type(val):
+        if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray):
+            return GGUFValueType.STRING
+        elif isinstance(val, list):
+            return GGUFValueType.ARRAY
+        elif isinstance(val, float):
+            return GGUFValueType.FLOAT32
+        elif isinstance(val, bool):
+            return GGUFValueType.BOOL
+        elif isinstance(val, int):
+            return GGUFValueType.INT32
+        # TODO: need help with 64-bit types in Python
+        else:
+            print("Unknown type: "+str(type(val)))
+            sys.exit()
+
+
+class GGUFWriter:
+    def __init__(self, path: str, arch: str, use_temp_file = True):
+        self.fout = open(path, "wb")
+        self.arch = arch
+        self.offset_tensor = 0
+        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
+        self.kv_data = b""
+        self.kv_data_count = 0
+        self.ti_data = b""
+        self.ti_data_count = 0
+        self.add_architecture()
+        self.use_temp_file = use_temp_file
+        self.tensors = []
+
+    def write_header_to_file(self):
+        self.fout.write(struct.pack("<I", GGUF_MAGIC))
+        self.fout.write(struct.pack("<I", GGUF_VERSION))
+        self.fout.write(struct.pack("<Q", self.ti_data_count))
+        self.fout.write(struct.pack("<Q", self.kv_data_count))
+        self.flush()
+#        print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
+
+    def write_kv_data_to_file(self):
+        self.fout.write(self.kv_data)
+        self.flush()
+
+    def write_ti_data_to_file(self):
+        self.fout.write(self.ti_data)
+        self.flush()
+
+    def add_key(self, key: str):
+        self.add_val(key, GGUFValueType.STRING, add_vtype=False)
+
+    def add_uint8(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT8)
+
+    def add_int8(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT8)
+
+    def add_uint16(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT16)
+
+    def add_int16(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT16)
+
+    def add_uint32(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT32)
+
+    def add_int32(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT32)
+
+    def add_float32(self, key: str, val: float):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.FLOAT32)
+
+    def add_uint64(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT64)
+
+    def add_int64(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT64)
+
+    def add_float64(self, key: str, val: float):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.FLOAT64)
+
+    def add_bool(self, key: str, val: bool):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.BOOL)
+
+    def add_string(self, key: str, val: str):
+        if len(val) == 0:
+            return
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.STRING)
+
+    def add_array(self, key: str, val: list):
+        if not isinstance(val, list):
+            raise ValueError("Value must be a list for array type")
+
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.ARRAY)
+
+    def add_val(self: str, val: Any, vtype: GGUFValueType = None, add_vtype: bool = True):
+        if vtype is None:
+            vtype = GGUFValueType.get_type(val)
+
+        if add_vtype:
+            self.kv_data += struct.pack("<I", vtype)
+            self.kv_data_count += 1
+
+        if vtype == GGUFValueType.UINT8:
+            self.kv_data += struct.pack("<B", val)
+        elif vtype == GGUFValueType.INT8:
+            self.kv_data += struct.pack("<b", val)
+        elif vtype == GGUFValueType.UINT16:
+            self.kv_data += struct.pack("<H", val)
+        elif vtype == GGUFValueType.INT16:
+            self.kv_data += struct.pack("<h", val)
+        elif vtype == GGUFValueType.UINT32:
+            self.kv_data += struct.pack("<I", val)
+        elif vtype == GGUFValueType.INT32:
+            self.kv_data += struct.pack("<i", val)
+        elif vtype == GGUFValueType.FLOAT32:
+            self.kv_data += struct.pack("<f", val)
+        elif vtype == GGUFValueType.UINT64:
+            self.kv_data += struct.pack("<Q", val)
+        elif vtype == GGUFValueType.INT64:
+            self.kv_data += struct.pack("<q", val)
+        elif vtype == GGUFValueType.FLOAT64:
+            self.kv_data += struct.pack("<d", val)
+        elif vtype == GGUFValueType.BOOL:
+            self.kv_data += struct.pack("?", val)
+        elif vtype == GGUFValueType.STRING:
+            encoded_val = val.encode("utf8") if isinstance(val, str) else val
+            self.kv_data += struct.pack("<Q", len(encoded_val))
+            self.kv_data += encoded_val
+        elif vtype == GGUFValueType.ARRAY:
+            ltype = set([GGUFValueType.get_type(item) for item in val])
+            assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
+            self.kv_data += struct.pack("<I", list(ltype)[0])
+            self.kv_data += struct.pack("<Q", len(val))
+            for item in val:
+                self.add_val(item, add_vtype=False)
+        else:
+            raise ValueError("Invalid GGUF metadata value type")
+
+    @staticmethod
+    def ggml_pad(x: int, n: int) -> int:
+        return ((x + n - 1) // n) * n
+
+    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
+        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
+
+        encoded_name = name.encode("utf8")
+        self.ti_data += struct.pack("<Q", len(encoded_name))
+        self.ti_data += encoded_name
+        n_dims = len(tensor_shape)
+        self.ti_data += struct.pack("<I", n_dims)
+        for i in range(n_dims):
+            self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
+        if raw_dtype is None:
+            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+        else:
+            dtype = raw_dtype
+        self.ti_data += struct.pack("<I", dtype)
+        self.ti_data += struct.pack("<Q", self.offset_tensor)
+        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
+        self.ti_data_count += 1
+
+    def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
+        if self.use_temp_file and not hasattr(self, "temp_file"):
+            self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
+            self.temp_file.seek(0)
+
+        self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
+
+        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
+
+        if not self.use_temp_file:
+            self.tensors.append((tensor, pad))
+            return
+
+        tensor.tofile(self.temp_file)
+
+        if pad != 0:
+            self.temp_file.write(bytes([0] * pad))
+
+    def write_tensor_data(self, tensor: np.ndarray):
+        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+        tensor.tofile(self.fout)
+
+        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+    def write_tensors_to_file(self):
+        self.write_ti_data_to_file()
+
+        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+        if not self.use_temp_file:
+            for (currtensor, currpad) in self.tensors:
+                currtensor.tofile(self.fout)
+                if currpad != 0:
+                    self.fout.write(bytes([0] * currpad))
+            return
+
+        self.temp_file.seek(0)
+
+        shutil.copyfileobj(self.temp_file, self.fout)
+        self.flush()
+        self.temp_file.close()
+
+    def flush(self):
+        self.fout.flush()
+
+    def close(self):
+        self.fout.close()
+
+    def add_architecture(self):
+        self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)
+
+    def add_author(self, author: str):
+        self.add_string(KEY_GENERAL_AUTHOR, author)
+
+    def add_tensor_data_layout(self, layout: str):
+        self.add_string(KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+
+    def add_url(self, url: str):
+        self.add_string(KEY_GENERAL_URL, url)
+
+    def add_description(self, description: str):
+        self.add_string(KEY_GENERAL_DESCRIPTION, description)
+
+    def add_source_url(self, url: str):
+        self.add_string(KEY_GENERAL_SOURCE_URL, url)
+
+    def add_source_hf_repo(self, repo: str):
+        self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
+
+    def add_file_type(self, ftype: int):
+        self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)
+
+    def add_name(self, name: str):
+        self.add_string(KEY_GENERAL_NAME, name)
+
+    def add_quantization_version(self, quantization_version: GGMLQuantizationType):
+        self.add_uint32(
+            KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
+
+    def add_custom_alignment(self, alignment: int):
+        self.data_alignment = alignment
+        self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)
+
+    def add_context_length(self, length: int):
+        self.add_uint32(
+            KEY_CONTEXT_LENGTH.format(arch=self.arch), length)
+
+    def add_embedding_length(self, length: int):
+        self.add_uint32(
+            KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+    def add_block_count(self, length: int):
+        self.add_uint32(
+            KEY_BLOCK_COUNT.format(arch=self.arch), length)
+
+    def add_feed_forward_length(self, length: int):
+        self.add_uint32(
+            KEY_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+
+    def add_parallel_residual(self, use: bool):
+        self.add_bool(
+            KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
+
+    def add_tensor_data_layout(self, layout: str):
+        self.add_string(
+            KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+
+    def add_head_count(self, count: int):
+        self.add_uint32(
+            KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)
+
+    def add_head_count_kv(self, count: int):
+        self.add_uint32(
+            KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)
+
+    def add_max_alibi_bias(self, bias: float):
+        self.add_float32(
+            KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)
+
+    def add_clamp_kqv(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)
+
+    def add_layer_norm_eps(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)
+
+    def add_layer_norm_rms_eps(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)
+
+    def add_rope_dimension_count(self, count: int):
+        self.add_uint32(
+            KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
+
+    def add_rope_freq_base(self, value: float):
+        self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)
+
+    def add_rope_scale_linear(self, value: float):
+        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
+
+    def add_tokenizer_model(self, model: str):
+        self.add_string(KEY_TOKENIZER_MODEL, model)
+
+    def add_token_list(self, tokens: List):
+        self.add_array(KEY_TOKENIZER_LIST, tokens)
+
+    def add_token_merges(self, merges: List):
+        self.add_array(KEY_TOKENIZER_MERGES, merges)
+
+    def add_token_types(self, types: List[int]):
+        self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
+
+    def add_token_scores(self, scores: List[float]):
+        self.add_array(KEY_TOKENIZER_SCORES, scores)
+
+    def add_bos_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_BOS_ID, id)
+
+    def add_eos_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_EOS_ID, id)
+
+    def add_unk_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_UNK_ID, id)
+
+    def add_sep_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_SEP_ID, id)
+
+    def add_pad_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
+
+
+# Example usage:
+if __name__ == "__main__":
+    # Example usage with a file
+    gguf_writer = GGUFWriter("example.gguf", "llama")
+
+    gguf_writer.add_architecture()
+    gguf_writer.add_block_count(12)
+    gguf_writer.add_uint32("answer", 42)  # Write a 32-bit integer
+    gguf_writer.add_float32("answer_in_float", 42.0)  # Write a 32-bit float
+    gguf_writer.add_custom_alignment(64)
+
+    tensor1 = np.ones((32,), dtype=np.float32) * 100.0
+    tensor2 = np.ones((64,), dtype=np.float32) * 101.0
+    tensor3 = np.ones((96,), dtype=np.float32) * 102.0
+
+    gguf_writer.add_tensor("tensor1", tensor1)
+    gguf_writer.add_tensor("tensor2", tensor2)
+    gguf_writer.add_tensor("tensor3", tensor3)
+
+    gguf_writer.write_header_to_file()
+    gguf_writer.write_kv_data_to_file()
+    gguf_writer.write_tensors_to_file()
+
+    gguf_writer.close()
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -0,0 +1,28 @@
+[tool.poetry]
+name = "gguf"
+version = "0.2.1"
+description = "Write ML models in GGUF for GGML"
+authors = ["GGML <ggml@ggml.ai>"]
+packages = [
+    {include = "gguf"},
+]
+readme = "README.md"
+homepage = "https://ggml.ai"
+repository = "https://github.com/ggerganov/llama.cpp"
+keywords = ["ggml", "gguf", "llama.cpp"]
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+
+[tool.poetry.dependencies]
+python = ">=3.8"
+numpy = ">=1.17"
+
+[tool.poetry.dev-dependencies]
+pytest = "^5.2"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
--- a/gguf-py/tests/test_gguf.py
+++ b/gguf-py/tests/test_gguf.py
@@ -0,0 +1,7 @@
+import gguf
+
+# TODO: add tests
+
+
+def test_write_gguf():
+    pass
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -0,0 +1,91 @@
+# GBNF Guide
+
+GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/server`.
+
+## Background
+
+[Bakus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
+
+## Basics
+
+In GBNF, we define *production rules* that specify how a *non-terminal* (rule name) can be replaced with sequences of *terminals* (characters, specifically Unicode [code points](https://en.wikipedia.org/wiki/Code_point)) and other non-terminals. The basic format of a production rule is `nonterminal ::= sequence...`.
+
+## Example
+
+Before going deeper, let's look at some of the features demonstrated in `grammars/chess.gbnf`, a small chess notation grammar:
+```
+# `root` specifies the pattern for the overall output
+root ::= (
+    # it must start with the characters "1. " followed by a sequence
+    # of characters that match the `move` rule, followed by a space, followed
+    # by another move, and then a newline
+    "1. " move " " move "\n"
+
+    # it's followed by one or more subsequent moves, numbered with one or two digits
+    ([1-9] [0-9]? ". " move " " move "\n")+
+)
+
+# `move` is an abstract representation, which can be a pawn, nonpawn, or castle.
+# The `[+#]?` denotes the possibility of checking or mate signs after moves
+move ::= (pawn | nonpawn | castle) [+#]?
+
+pawn ::= ...
+nonpawn ::= ...
+castle ::= ...
+```
+
+## Non-Terminals and Terminals
+
+Non-terminal symbols (rule names) stand for a pattern of terminals and other non-terminals. They are required to be a dashed lowercase word, like `move`, `castle`, or `check-mate`.
+
+Terminals are actual characters ([code points](https://en.wikipedia.org/wiki/Code_point)). They can be specified as a sequence like `"1"` or `"O-O"` or as ranges like `[1-9]` or `[NBKQR]`.
+
+## Characters and character ranges
+
+Terminals support the full range of Unicode. Unicode characters can be specified directly in the grammar, for example `hiragana ::= [ぁ-ゟ]`, or with escapes: 8-bit (`\xXX`), 16-bit (`\uXXXX`) or 32-bit (`\UXXXXXXXX`).
+
+Character ranges can be negated with `^`:
+```
+single-line ::= [^\n]+ "\n"`
+```
+
+## Sequences and Alternatives
+
+The order of symbols in a sequence matter. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.
+
+Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`.
+
+Parentheses `()` can be used to group sequences, which allows for embedding alternatives in a larger rule or applying repetition and optptional symbols (below) to a sequence.
+
+## Repetition and Optional Symbols
+
+- `*` after a symbol or sequence means that it can be repeated zero or more times.
+- `+` denotes that the symbol or sequence should appear one or more times.
+- `?` makes the preceding symbol or sequence optional.
+
+## Comments and newlines
+
+Comments can be specified with `#`:
+```
+# defines optional whitspace
+ws ::= [ \t\n]+
+```
+
+Newlines are allowed between rules and between symbols or sequences nested inside parentheses. Additionally, a newline after an alternate marker `|` will continue the current rule, even outside of parentheses.
+
+## The root rule
+
+In a full grammar, the `root` rule always defines the starting point of the grammar. In other words, it specifies what the entire output must match.
+
+```
+# a grammar for lists
+root ::= ("- " item)+
+item ::= [^\n]+ "\n"
+```
+
+## Next steps
+
+This guide provides a brief overview. Check out the GBNF files in this directory (`grammars/`) for examples of full grammars. You can try them out with:
+```
+./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
+```
--- a/grammars/json.gbnf
+++ b/grammars/json.gbnf
@@ -1,29 +1,25 @@
-# Grammar for subset of JSON - doesn't support full string or number syntax
-
-root  ::= object
-value ::= object | array | string | number | boolean | "null"
+root   ::= object
+value  ::= object | array | string | number | ("true" | "false" | "null") ws

 object ::=
  "{" ws (
            string ":" ws value
    ("," ws string ":" ws value)*
-  )? "}"
+  )? "}" ws

 array  ::=
  "[" ws (
            value
    ("," ws value)*
-  )? "]"
+  )? "]" ws

-string  ::=
+string ::=
  "\"" (
    [^"\\] |
    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
  )* "\"" ws

-# Only plain integers currently
-number  ::= "-"? [0-9]+ ws
-boolean ::= ("true" | "false") ws
+number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws

 # Optional space: by convention, applied in this grammar after literal chars when allowed
 ws ::= ([ \t\n] ws)?
--- a/k_quants.c
+++ b/k_quants.c
@@ -39,6 +39,8 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))

+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
 //
 // 2-6 bit quantization in super-blocks
 //
@@ -75,6 +77,11 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
        }
        return 1/iscale;
    }
+    bool return_early = false;
+    if (rmse_type < 0) {
+        rmse_type = -rmse_type;
+        return_early = true;
+    }
    int weight_type = rmse_type%2;
    float sumlx = 0;
    float suml2 = 0;
@@ -87,56 +94,9 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
        suml2 += w*l*l;
    }
    float scale = sumlx/suml2;
+    if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
    float best = scale * sumlx;
-    for (int itry = 0; itry < 3; ++itry) {
-        iscale = 1/scale;
-        float slx = 0;
-        float sl2 = 0;
-        bool changed = false;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            l = MAX(-nmax, MIN(nmax-1, l));
-            if (l + nmax != L[i]) { changed = true; }
-            float w = weight_type == 1 ? x[i] * x[i] : 1.f;
-            slx += w*x[i]*l;
-            sl2 += w*l*l;
-        }
-        if (!changed || sl2 == 0 || slx*slx <= best*sl2) { break; }
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
-        }
-        sumlx = slx; suml2 = sl2;
-        scale = sumlx/suml2;
-        best = scale * sumlx;
-    }
-    for (int itry = 0; itry < 5; ++itry) {
-        int n_changed = 0;
-        for (int i = 0; i < n; ++i) {
-            float w = weight_type == 1 ? x[i]*x[i] : 1;
-            int l = L[i] - nmax;
-            float slx = sumlx - w*x[i]*l;
-            if (slx > 0) {
-                float sl2 = suml2 - w*l*l;
-                int new_l = nearest_int(x[i] * sl2 / slx);
-                new_l = MAX(-nmax, MIN(nmax-1, new_l));
-                if (new_l != l) {
-                    slx += w*x[i]*new_l;
-                    sl2 += w*new_l*new_l;
-                    if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
-                        L[i] = nmax + new_l; sumlx = slx; suml2 = sl2;
-                        scale = sumlx / suml2; best = scale * sumlx;
-                        ++n_changed;
-                    }
-                }
-            }
-        }
-        if (!n_changed) { break; }
-    }
-    if (rmse_type < 3) {
-        return scale;
-    }
-    for (int is = -4; is <= 4; ++is) {
+    for (int is = -9; is <= 9; ++is) {
        if (is == 0) {
            continue;
        }
@@ -219,12 +179,17 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
    return 1/iscale;
 }

-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, int ntry) {
+static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+        int ntry, float alpha) {
    float min = x[0];
    float max = x[0];
+    float sum_x = 0;
+    float sum_x2 = 0;
    for (int i = 1; i < n; ++i) {
        if (x[i] < min) min = x[i];
        if (x[i] > max) max = x[i];
+        sum_x += x[i];
+        sum_x2 += x[i]*x[i];
    }
    if (max == min) {
        for (int i = 0; i < n; ++i) L[i] = 0;
@@ -252,7 +217,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
        for (int i = 0; i < n; ++i) {
            sum += x[i] - scale*L[i];
        }
-        min = sum/n;
+        min = alpha*min + (1 - alpha)*sum/n;
        if (min > 0) min = 0;
        iscale = 1/scale;
        if (!did_change) break;
@@ -261,6 +226,82 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
    return scale;
 }

+static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
+        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+        float rmin, float rdelta, int nstep, bool use_mad) {
+    float min = x[0];
+    float max = x[0];
+    float sum_w = weights[0];
+    float sum_x = sum_w * x[0];
+    for (int i = 1; i < n; ++i) {
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+        float w = weights[i];
+        sum_w += w;
+        sum_x += w * x[i];
+    }
+    if (min > 0) min = 0;
+    if (max == min) {
+        for (int i = 0; i < n; ++i) L[i] = 0;
+        *the_min = -min;
+        return 0.f;
+    }
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    float best_mad = 0;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale*(x[i] - min));
+        L[i] = MAX(0, MIN(nmax, l));
+        float diff = scale * L[i] + min - x[i];
+        diff = use_mad ? fabsf(diff) : diff * diff;
+        float w = weights[i];
+        best_mad += w * diff;
+    }
+    if (nstep < 1) {
+        *the_min = -min;
+        return scale;
+    }
+    for (int is = 0; is <= nstep; ++is) {
+        iscale = (rmin + rdelta*is + nmax)/(max - min);
+        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            Laux[i] = l;
+            float w = weights[i];
+            sum_l += w*l;
+            sum_l2 += w*l*l;
+            sum_xl += w*l*x[i];
+        }
+        float D = sum_w * sum_l2 - sum_l * sum_l;
+        if (D > 0) {
+            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
+            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
+            if (this_min > 0) {
+                this_min = 0;
+                this_scale = sum_xl / sum_l2;
+            }
+            float mad = 0;
+            for (int i = 0; i < n; ++i) {
+                float diff = this_scale * Laux[i] + this_min - x[i];
+                diff = use_mad ? fabsf(diff) : diff * diff;
+                float w = weights[i];
+                mad += w * diff;
+            }
+            if (mad < best_mad) {
+                for (int i = 0; i < n; ++i) {
+                    L[i] = Laux[i];
+                }
+                best_mad = mad;
+                scale = this_scale;
+                min = this_min;
+            }
+        }
+    }
+    *the_min = -min;
+    return scale;
+}
+
 #if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
    if (j < 4) {
@@ -279,6 +320,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
    const int nb = k / QK_K;

    uint8_t L[QK_K];
+    uint8_t Laux[16];
+    float   weights[16];
    float mins[QK_K/16];
    float scales[QK_K/16];

@@ -289,7 +332,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
        float max_scale = 0; // as we are deducting the min, scales are always positive
        float max_min = 0;
        for (int j = 0; j < QK_K/16; ++j) {
-            scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5);
+            for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
+            scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
            float scale = scales[j];
            if (scale > max_scale) {
                max_scale = scale;
@@ -635,6 +679,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
    const int nb = k / QK_K;

    uint8_t L[QK_K];
+    uint8_t Laux[32];
+    float   weights[32];
    float mins[QK_K/32];
    float scales[QK_K/32];

@@ -643,7 +689,12 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
        float max_scale = 0; // as we are deducting the min, scales are always positive
        float max_min = 0;
        for (int j = 0; j < QK_K/32; ++j) {
-            scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 5);
+            //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
+            float sum_x2 = 0;
+            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
+            float av_x = sqrtf(sum_x2/32);
+            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
            float scale = scales[j];
            if (scale > max_scale) {
                max_scale = scale;
@@ -796,6 +847,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
    uint8_t L[QK_K];
    float mins[QK_K/32];
    float scales[QK_K/32];
+    float weights[32];
+    uint8_t Laux[32];
 #else
    int8_t L[QK_K];
    float scales[QK_K/16];
@@ -808,7 +861,12 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
        float max_scale = 0; // as we are deducting the min, scales are always positive
        float max_min = 0;
        for (int j = 0; j < QK_K/32; ++j) {
-            scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5);
+            //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
+            float sum_x2 = 0;
+            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
+            float av_x = sqrtf(sum_x2/32);
+            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
            float scale = scales[j];
            if (scale > max_scale) {
                max_scale = scale;
@@ -1353,7 +1411,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
        const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};

        __m256i sumi = _mm256_setzero_si256();

@@ -1421,7 +1479,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
        const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));

        // sumf += -dmin * summs in 32bits*8
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);

        const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
        const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
@@ -1493,7 +1551,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
        }

        // sumf += dall * isum - dmin * summs in 32bits
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
    }

@@ -1644,8 +1702,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
        summs += dmin * smin;

        const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-        const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
-        const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
+        const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
+        const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);

        const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
        const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
@@ -1709,10 +1767,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
        const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
        const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));

-        const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
-        const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
-        const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
-        const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+        const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+        const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+        const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+        const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));

        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
@@ -1917,7 +1975,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
        const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
        const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
        const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-        const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+        const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};

        // high bit
        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
@@ -2128,7 +2186,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
        }

        // multiply with block scale and accumulate
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);

    }
@@ -2303,13 +2361,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
        aux16[0] = a & 0x0f0f;
        aux16[1] = (a >> 4) & 0x0f0f;

-        const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
-        const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
+        const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
+        const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));

        memcpy(&aux64, x[i].hmask, 8);

        const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-        __m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
+        __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
        __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
        q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
        q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
@@ -2318,7 +2376,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
        const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);

        // prepare low and high bits
-        const __m256i q3aux  = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
+        const __m256i q3aux  = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
        const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
        const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);

@@ -2429,7 +2487,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

        p16_0 = _mm_add_epi32(p16_0, p16_2);
        p16_1 = _mm_add_epi32(p16_1, p16_3);
-        __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+        __m256i p16 = MM256_SET_M128I(p16_1, p16_0);

        // multiply with block scale and accumulate
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
@@ -2620,7 +2678,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
        acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);

        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = _mm256_set_m128i(sc128, sc128);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);

        __m256i sumi = _mm256_setzero_si256();

@@ -2727,7 +2785,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
        }

        __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);

    }
@@ -2968,11 +3026,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

        const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
        const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);

        const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
        const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
-        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);

    }

@@ -3160,7 +3218,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
        summs += dmin * _mm_extract_epi32(hsum, 0);

        const __m128i sc128  = _mm256_extracti128_si256(mins_and_scales, 0);
-        const __m256i scales = _mm256_set_m128i(sc128, sc128);
+        const __m256i scales = MM256_SET_M128I(sc128, sc128);

        const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
        __m256i hmask = mone;
@@ -3299,7 +3357,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
        }

        __m256 vd = _mm256_set1_ps(d);
-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
        acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);

    }
@@ -3462,13 +3520,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri

        const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);

-        const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
-        const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
+        const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
+        const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));

        int64_t aux64;
        memcpy(&aux64, x[i].qh, 8);
        const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
-        const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
+        const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);

        const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
        const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
@@ -3543,7 +3601,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
        const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
        const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));

-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);

    }

@@ -3925,7 +3983,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

        }

-        __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
    }

@@ -4083,8 +4141,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
        const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
        const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);

-        const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
-        const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
+        const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
+        const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);

        const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
        const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
@@ -4177,7 +4235,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
        sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
        sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));

-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
    }

    *s = hsum_float_8(acc);
--- a/llama-util.h
+++ b/llama-util.h
@@ -1,504 +0,0 @@
-// Internal header to be included only by llama.cpp.
-// Contains wrappers around OS interfaces.
-
-#ifndef LLAMA_UTIL_H
-#define LLAMA_UTIL_H
-
-#include <cstdio>
-#include <cstdint>
-#include <cerrno>
-#include <cstring>
-#include <cstdarg>
-#include <cstdlib>
-#include <climits>
-
-#include <string>
-#include <vector>
-#include <stdexcept>
-
-#ifdef __has_include
-    #if __has_include(<unistd.h>)
-        #include <unistd.h>
-        #if defined(_POSIX_MAPPED_FILES)
-            #include <sys/mman.h>
-        #endif
-        #if defined(_POSIX_MEMLOCK_RANGE)
-            #include <sys/resource.h>
-        #endif
-    #endif
-#endif
-
-#if defined(_WIN32)
-    #define WIN32_LEAN_AND_MEAN
-    #ifndef NOMINMAX
-        #define NOMINMAX
-    #endif
-    #include <windows.h>
-    #include <io.h>
-    #include <stdio.h> // for _fseeki64
-#endif
-
-#define LLAMA_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
-#ifdef __GNUC__
-#ifdef __MINGW32__
-__attribute__((format(gnu_printf, 1, 2)))
-#else
-__attribute__((format(printf, 1, 2)))
-#endif
-#endif
-static std::string format(const char * fmt, ...) {
-    va_list ap, ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    LLAMA_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
-        }
-        seek(0, SEEK_END);
-        size = tell();
-        seek(0, SEEK_SET);
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        LLAMA_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t len) const {
-        if (len == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, len, 1, fp);
-        if (ferror(fp)) {
-            throw std::runtime_error(format("read error: %s", strerror(errno)));
-        }
-        if (ret != 1) {
-            throw std::runtime_error(std::string("unexpectedly reached end of file"));
-        }
-    }
-
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t len) const {
-        if (len == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, len, 1, fp);
-        if (ret != 1) {
-            throw std::runtime_error(format("write error: %s", strerror(errno)));
-        }
-    }
-
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
-    }
-
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
-        }
-    }
-};
-
-#if defined(_WIN32)
-static std::string llama_format_win_err(DWORD err) {
-    LPSTR buf;
-    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
-                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
-    if (!size) {
-        return "FormatMessageA failed";
-    }
-    std::string ret(buf, size);
-    LocalFree(buf);
-    return ret;
-}
-#endif
-
-struct llama_mmap {
-    void * addr;
-    size_t size;
-
-    llama_mmap(const llama_mmap &) = delete;
-
-#ifdef _POSIX_MAPPED_FILES
-    static constexpr bool SUPPORTED = true;
-
-    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
-        size = file->size;
-        int fd = fileno(file->fp);
-        int flags = MAP_SHARED;
-        // prefetch/readahead impairs performance on NUMA systems
-        if (numa) { prefetch = 0; }
-#ifdef __linux__
-        if (prefetch) { flags |= MAP_POPULATE; }
-#endif
-        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        if (addr == MAP_FAILED) {
-            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
-        }
-
-        if (prefetch > 0) {
-            // Advise the kernel to preload the mapped memory
-            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
-                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                        strerror(errno));
-            }
-        }
-        if (numa) {
-            // advise the kernel not to use readahead
-            // (because the next page might not belong on the same node)
-            if (madvise(addr, file->size, MADV_RANDOM)) {
-                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
-                        strerror(errno));
-            }
-        }
-    }
-
-    ~llama_mmap() {
-        munmap(addr, size);
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
-        (void) numa;
-
-        size = file->size;
-
-        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
-
-        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-        DWORD error = GetLastError();
-
-        if (hMapping == NULL) {
-            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
-        }
-
-        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-        error = GetLastError();
-        CloseHandle(hMapping);
-
-        if (addr == NULL) {
-            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
-        }
-
-        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-        if (prefetch) {
-            // Advise the kernel to preload the mapped memory
-            WIN32_MEMORY_RANGE_ENTRY range;
-            range.VirtualAddress = addr;
-            range.NumberOfBytes = (SIZE_T)size;
-            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-            }
-        }
-        #else
-        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
-        #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
-    }
-
-    ~llama_mmap() {
-        if (!UnmapViewOfFile(addr)) {
-            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
-        (void) prefetch;
-        (void) numa;
-
-        throw std::runtime_error(std::string("mmap not supported"));
-    }
-#endif
-};
-
-// Represents some region of memory being locked using mlock or VirtualLock;
-// will automatically unlock on destruction.
-struct llama_mlock {
-    void * addr = NULL;
-    size_t size = 0;
-    bool failed_already = false;
-
-    llama_mlock() {}
-    llama_mlock(const llama_mlock &) = delete;
-
-    ~llama_mlock() {
-        if (size) {
-            raw_unlock(addr, size);
-        }
-    }
-
-    void init(void * ptr) {
-        LLAMA_ASSERT(addr == NULL && size == 0);
-        addr = ptr;
-    }
-
-    void grow_to(size_t target_size) {
-        LLAMA_ASSERT(addr);
-        if (failed_already) {
-            return;
-        }
-        size_t granularity = lock_granularity();
-        target_size = (target_size + granularity - 1) & ~(granularity - 1);
-        if (target_size > size) {
-            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
-                size = target_size;
-            } else {
-                failed_already = true;
-            }
-        }
-    }
-
-#ifdef _POSIX_MEMLOCK_RANGE
-    static constexpr bool SUPPORTED = true;
-
-    size_t lock_granularity() {
-        return (size_t) sysconf(_SC_PAGESIZE);
-    }
-
-    #ifdef __APPLE__
-        #define MLOCK_SUGGESTION \
-            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-            "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
-    #else
-        #define MLOCK_SUGGESTION \
-            "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
-    #endif
-
-    bool raw_lock(const void * addr, size_t size) {
-        if (!mlock(addr, size)) {
-            return true;
-        } else {
-            char* errmsg = std::strerror(errno);
-            bool suggest = (errno == ENOMEM);
-
-            // Check if the resource limit is fine after all
-            struct rlimit lock_limit;
-            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
-                suggest = false;
-            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
-                suggest = false;
-
-            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
-                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
-            return false;
-        }
-    }
-
-    #undef MLOCK_SUGGESTION
-
-    void raw_unlock(void * addr, size_t size) {
-        if (munlock(addr, size)) {
-            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
-        }
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    size_t lock_granularity() {
-        SYSTEM_INFO si;
-        GetSystemInfo(&si);
-        return (size_t) si.dwPageSize;
-    }
-
-    bool raw_lock(void * ptr, size_t len) {
-        for (int tries = 1; ; tries++) {
-            if (VirtualLock(ptr, len)) {
-                return true;
-            }
-            if (tries == 2) {
-                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                    len, size, llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-
-            // It failed but this was only the first try; increase the working
-            // set size and try again.
-            SIZE_T min_ws_size, max_ws_size;
-            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
-                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-            // Per MSDN: "The maximum number of pages that a process can lock
-            // is equal to the number of pages in its minimum working set minus
-            // a small overhead."
-            // Hopefully a megabyte is enough overhead:
-            size_t increment = len + 1048576;
-            // The minimum must be <= the maximum, so we need to increase both:
-            min_ws_size += increment;
-            max_ws_size += increment;
-            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
-                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-        }
-    }
-
-    void raw_unlock(void * ptr, size_t len) {
-        if (!VirtualUnlock(ptr, len)) {
-            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    size_t lock_granularity() {
-        return (size_t) 65536;
-    }
-
-    bool raw_lock(const void * addr, size_t len) {
-        fprintf(stderr, "warning: mlock not supported on this system\n");
-        return false;
-    }
-
-    void raw_unlock(const void * addr, size_t len) {}
-#endif
-};
-
-// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
-struct llama_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-
-    llama_buffer() = default;
-
-    void resize(size_t len) {
-#ifdef GGML_USE_METAL
-        free(addr);
-        int result = posix_memalign((void **) &addr, getpagesize(), len);
-        if (result == 0) {
-            memset(addr, 0, len);
-        }
-        else {
-            addr = NULL;
-        }
-#else
-        delete[] addr;
-        addr = new uint8_t[len];
-#endif
-        size = len;
-    }
-
-    ~llama_buffer() {
-#ifdef GGML_USE_METAL
-        free(addr);
-#else
-        delete[] addr;
-#endif
-        addr = NULL;
-    }
-
-    // disable copy and move
-    llama_buffer(const llama_buffer&) = delete;
-    llama_buffer(llama_buffer&&) = delete;
-    llama_buffer& operator=(const llama_buffer&) = delete;
-    llama_buffer& operator=(llama_buffer&&) = delete;
-};
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-struct llama_ctx_buffer {
-    uint8_t * addr = NULL;
-    bool is_cuda;
-    size_t size = 0;
-
-    llama_ctx_buffer() = default;
-
-    void resize(size_t size) {
-        free();
-
-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
-        if (addr) {
-            is_cuda = true;
-        }
-        else {
-            // fall back to pageable memory
-            addr = new uint8_t[size];
-            is_cuda = false;
-        }
-        this->size = size;
-    }
-
-    void free() {
-        if (addr) {
-            if (is_cuda) {
-                ggml_cuda_host_free(addr);
-            }
-            else {
-                delete[] addr;
-            }
-        }
-        addr = NULL;
-    }
-
-    ~llama_ctx_buffer() {
-        free();
-    }
-
-    // disable copy and move
-    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
-    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
-    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
-    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
-};
-#else
-typedef llama_buffer llama_ctx_buffer;
-#endif
-
-#endif
--- a/Show More
+++ b/Show More