mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-04-16 16:27:32 +03:00)

Compare commits (57 commits): speculativ... → custom-att...
Commits in this compare (SHA1 only):

c5650ed470, ce2d995af2, 2b8830af71, a207561503, d008733e6b, 4c72ab13b2, e9463792d3, 4ad0676927,
25856900db, c1596f633f, 8845160058, 5a3369d8e8, b2debf65f2, ded9b43cad, ee1d670cc6, 1be2b8c19b,
2f3a46fccf, 54206962c7, e04dc51988, db0fc2da06, b377bf2266, addae65fd4, a1327c71c6, e1067efbfa,
7b7472ee26, 6028879f56, eed3fd4234, 8a9aca37c1, 4b5f3cd6bf, 82e20e9ba0, d37081ae5d, 16090a5dde,
806d397c1a, ddad227782, 36714e16d0, 467e307931, 25bd254089, 7e2b9974d1, daf4c6d360, fa0e677820,
897caccdf4, 466b513851, 0161372b9a, 1f17ea631c, 7c1bdd0e8a, 0cbf3bfef8, 86c90e34f5, f015b26689,
4d76d762ef, 6952a460b9, 9f42e75489, 58bb5110ca, d29e76937c, fad56936d4, 1fb033fd85, 3b4bab6a38,
c5df72e848
@@ -1,9 +1,6 @@
*.o
*.a
.cache/
.git/
.github/
.gitignore
.vs/
.vscode/
.DS_Store
108
.github/workflows/build.yml
vendored
108
.github/workflows/build.yml
vendored
@@ -10,10 +10,10 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
|
||||
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
@@ -38,13 +38,13 @@ jobs:
|
||||
- name: Build
|
||||
id: make_build
|
||||
run: |
|
||||
CC=gcc-8 make -j $(nproc)
|
||||
CC=gcc-8 make
|
||||
|
||||
- name: Test
|
||||
id: make_test
|
||||
run: |
|
||||
CC=gcc-8 make tests -j $(nproc)
|
||||
make test -j $(nproc)
|
||||
CC=gcc-8 make tests
|
||||
make test
|
||||
|
||||
ubuntu-latest-cmake:
|
||||
runs-on: ubuntu-latest
|
||||
@@ -66,7 +66,7 @@ jobs:
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
cmake --build . --config Release
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
@@ -101,7 +101,7 @@ jobs:
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
|
||||
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
|
||||
cmake --build . --config ${{ matrix.build_type }}
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
@@ -135,7 +135,7 @@ jobs:
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DLLAMA_MPI=ON ..
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
cmake --build . --config Release
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
@@ -160,13 +160,13 @@ jobs:
|
||||
- name: Build
|
||||
id: make_build
|
||||
run: |
|
||||
make -j $(sysctl -n hw.logicalcpu)
|
||||
make
|
||||
|
||||
- name: Test
|
||||
id: make_test
|
||||
run: |
|
||||
make tests -j $(sysctl -n hw.logicalcpu)
|
||||
make test -j $(sysctl -n hw.logicalcpu)
|
||||
make tests
|
||||
make test
|
||||
|
||||
macOS-latest-cmake:
|
||||
runs-on: macos-latest
|
||||
@@ -188,8 +188,8 @@ jobs:
|
||||
sysctl -a
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF ..
|
||||
cmake --build . --config Release
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
@@ -223,7 +223,7 @@ jobs:
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
|
||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
cmake --build . --config Release
|
||||
|
||||
macOS-latest-cmake-tvos:
|
||||
runs-on: macos-latest
|
||||
@@ -251,35 +251,7 @@ jobs:
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
|
||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
macOS-latest-swift:
|
||||
runs-on: macos-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
continue-on-error: true
|
||||
run: |
|
||||
brew update
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
xcodebuild -scheme llama -destination "${{ matrix.destination }}"
|
||||
|
||||
- name: Build Swift Example
|
||||
id: make_build_swift_example
|
||||
run: |
|
||||
make swift
|
||||
cmake --build . --config Release
|
||||
|
||||
windows-latest-cmake:
|
||||
runs-on: windows-latest
|
||||
@@ -293,17 +265,17 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'noavx'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx2'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx512'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'clblast'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -352,7 +324,7 @@ jobs:
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. ${{ matrix.defines }}
|
||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
cmake --build . --config Release
|
||||
|
||||
- name: Add clblast.dll
|
||||
id: add_clblast_dll
|
||||
@@ -442,8 +414,8 @@ jobs:
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
|
||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
|
||||
cmake --build . --config Release
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
@@ -485,22 +457,22 @@ jobs:
|
||||
path: |
|
||||
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v3
|
||||
#
|
||||
# - name: Build
|
||||
# uses: cross-platform-actions/action@v0.19.0
|
||||
# with:
|
||||
# operating_system: freebsd
|
||||
# version: '13.2'
|
||||
# hypervisor: 'qemu'
|
||||
# run: |
|
||||
# sudo pkg update
|
||||
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
|
||||
# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
|
||||
freeBSD-latest:
|
||||
runs-on: macos-12
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Build
|
||||
uses: cross-platform-actions/action@v0.19.0
|
||||
with:
|
||||
operating_system: freebsd
|
||||
version: '13.2'
|
||||
hypervisor: 'qemu'
|
||||
run: |
|
||||
sudo pkg update
|
||||
sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
|
||||
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
.github/workflows/gguf-publish.yml (3 changes, vendored)
@@ -36,9 +36,8 @@ jobs:
poetry install

- name: Build package
run: cd gguf-py && poetry build
run: poetry build
- name: Publish package
uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_API_TOKEN }}
packages-dir: gguf-py/dist
.github/workflows/zig-build.yml (25 changes, vendored)
@@ -1,25 +0,0 @@
name: Zig CI

on:
pull_request:
push:
branches:
- master

jobs:
build:
strategy:
fail-fast: false
matrix:
runs-on: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.runs-on }}
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
fetch-depth: 0
- uses: goto-bus-stop/setup-zig@v2
with:
version: 0.11.0
- name: Build Summary
run: zig build --summary all -freference-trace
.gitignore (10 changes, vendored)
@@ -10,7 +10,6 @@
*.gcno
*.gcda
*.dot
*.metallib
.DS_Store
.build/
.cache/
@@ -41,14 +40,11 @@ models-mnt
/embedding
/gguf
/gguf-llama-simple
/infill
/libllama.so
/llama-bench
/llava
/main
/metal
/perplexity
/q8dot
/quantize
/quantize-stats
/result
@@ -56,9 +52,6 @@ models-mnt
/server
/simple
/batched
/batched-bench
/export-lora
/finetune
/speculative
/parallel
/train-text-from-scratch
@@ -94,5 +87,4 @@ tests/test-quantize-perf
tests/test-sampling
tests/test-tokenizer-0-llama
tests/test-tokenizer-0-falcon
tests/test-tokenizer-1-llama
tests/test-tokenizer-1-bpe
tests/test-tokenizer-1
109
CMakeLists.txt
109
CMakeLists.txt
@@ -1,4 +1,4 @@
|
||||
cmake_minimum_required(VERSION 3.13) # for add_link_options
|
||||
cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
|
||||
project("llama.cpp" C CXX)
|
||||
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
@@ -44,7 +44,7 @@ endif()
|
||||
|
||||
# general
|
||||
option(LLAMA_STATIC "llama: static link libraries" OFF)
|
||||
option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
|
||||
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
|
||||
option(LLAMA_LTO "llama: enable link time optimization" OFF)
|
||||
|
||||
# debug
|
||||
@@ -58,21 +58,15 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer"
|
||||
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
|
||||
|
||||
# instruction set specific
|
||||
if (LLAMA_NATIVE)
|
||||
set(INS_ENB OFF)
|
||||
else()
|
||||
set(INS_ENB ON)
|
||||
endif()
|
||||
|
||||
option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
|
||||
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
|
||||
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
|
||||
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
|
||||
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
|
||||
option(LLAMA_FMA "llama: enable FMA" ${INS_ENB})
|
||||
option(LLAMA_AVX "llama: enable AVX" ON)
|
||||
option(LLAMA_AVX2 "llama: enable AVX2" ON)
|
||||
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
|
||||
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
|
||||
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
|
||||
option(LLAMA_FMA "llama: enable FMA" ON)
|
||||
# in MSVC F16C is implied with AVX2/AVX512
|
||||
if (NOT MSVC)
|
||||
option(LLAMA_F16C "llama: enable F16C" ${INS_ENB})
|
||||
option(LLAMA_F16C "llama: enable F16C" ON)
|
||||
endif()
|
||||
|
||||
# 3rd party libs
|
||||
@@ -349,9 +343,8 @@ if (LLAMA_MPI)
|
||||
set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
|
||||
add_compile_definitions(GGML_USE_MPI)
|
||||
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
|
||||
if (NOT MSVC)
|
||||
add_compile_options(-Wno-cast-qual)
|
||||
endif()
|
||||
set(cxx_flags ${cxx_flags} -Wno-cast-qual)
|
||||
set(c_flags ${c_flags} -Wno-cast-qual)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
|
||||
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
|
||||
# Even if you're only using the C header, C++ programs may bring in MPI
|
||||
@@ -421,56 +414,43 @@ endif()
|
||||
|
||||
if (LLAMA_ALL_WARNINGS)
|
||||
if (NOT MSVC)
|
||||
set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
|
||||
set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
|
||||
set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
|
||||
set(host_cxx_flags "")
|
||||
|
||||
if (CMAKE_C_COMPILER_ID MATCHES "Clang")
|
||||
set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)
|
||||
|
||||
if (
|
||||
(CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
|
||||
(CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0)
|
||||
)
|
||||
set(c_flags ${c_flags} -Wdouble-promotion)
|
||||
endif()
|
||||
elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
|
||||
set(c_flags ${c_flags} -Wdouble-promotion)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)
|
||||
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
|
||||
endif()
|
||||
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
|
||||
set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
|
||||
endif()
|
||||
set(c_flags
|
||||
-Wall
|
||||
-Wextra
|
||||
-Wpedantic
|
||||
-Wcast-qual
|
||||
-Wdouble-promotion
|
||||
-Wshadow
|
||||
-Wstrict-prototypes
|
||||
-Wpointer-arith
|
||||
-Wmissing-prototypes
|
||||
-Werror=implicit-int
|
||||
-Wno-unused-function
|
||||
)
|
||||
set(cxx_flags
|
||||
-Wall
|
||||
-Wextra
|
||||
-Wpedantic
|
||||
-Wcast-qual
|
||||
-Wmissing-declarations
|
||||
-Wno-unused-function
|
||||
-Wno-multichar
|
||||
)
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
|
||||
# g++ only
|
||||
set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds)
|
||||
endif()
|
||||
else()
|
||||
# todo : msvc
|
||||
endif()
|
||||
|
||||
set(c_flags ${c_flags} ${warning_flags})
|
||||
set(cxx_flags ${cxx_flags} ${warning_flags})
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
|
||||
"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
|
||||
"$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
|
||||
add_compile_options(
|
||||
"$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
|
||||
"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
|
||||
)
|
||||
|
||||
endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
set(cuda_flags -Wno-pedantic)
|
||||
endif()
|
||||
set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
|
||||
|
||||
list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument
|
||||
if (NOT cuda_host_flags STREQUAL "")
|
||||
set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
|
||||
endif()
|
||||
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
|
||||
|
||||
if (WIN32)
|
||||
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
|
||||
|
||||
@@ -510,6 +490,9 @@ if (NOT MSVC)
|
||||
if (LLAMA_GPROF)
|
||||
add_compile_options(-pg)
|
||||
endif()
|
||||
if (LLAMA_NATIVE)
|
||||
add_compile_options(-march=native)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
|
||||
@@ -564,9 +547,6 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
|
||||
endif()
|
||||
else()
|
||||
if (LLAMA_NATIVE)
|
||||
add_compile_options(-march=native)
|
||||
endif()
|
||||
if (LLAMA_F16C)
|
||||
add_compile_options(-mf16c)
|
||||
endif()
|
||||
@@ -663,8 +643,6 @@ add_library(ggml OBJECT
|
||||
ggml.h
|
||||
ggml-alloc.c
|
||||
ggml-alloc.h
|
||||
ggml-backend.c
|
||||
ggml-backend.h
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
@@ -726,7 +704,6 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
|
||||
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
||||
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
||||
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
||||
get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
|
||||
|
||||
192
Makefile
192
Makefile
@@ -1,14 +1,8 @@
|
||||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = \
|
||||
main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
||||
simple batched batched-bench save-load-state server embd-input-test gguf llama-bench llava baby-llama beam-search \
|
||||
speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel tests/test-c.o
|
||||
|
||||
# Binaries only useful for tests
|
||||
TEST_TARGETS = \
|
||||
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
|
||||
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
|
||||
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
|
||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
|
||||
|
||||
# Code coverage output files
|
||||
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
||||
@@ -25,20 +19,6 @@ ifndef UNAME_M
|
||||
UNAME_M := $(shell uname -m)
|
||||
endif
|
||||
|
||||
ifeq '' '$(findstring clang,$(shell $(CC) --version))'
|
||||
CC_IS_GCC=1
|
||||
CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
|
||||
else
|
||||
CC_IS_CLANG=1
|
||||
ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))'
|
||||
CC_IS_LLVM_CLANG=1
|
||||
else
|
||||
CC_IS_APPLE_CLANG=1
|
||||
endif
|
||||
CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
|
||||
| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
|
||||
endif
|
||||
|
||||
# Mac OS + Arm can report x86_64
|
||||
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
@@ -68,10 +48,8 @@ test: $(TEST_TARGETS)
|
||||
if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
|
||||
continue; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
|
||||
continue; \
|
||||
else \
|
||||
echo "Running test $$test_target..."; \
|
||||
@@ -109,6 +87,9 @@ CC := riscv64-unknown-linux-gnu-gcc
|
||||
CXX := riscv64-unknown-linux-gnu-g++
|
||||
endif
|
||||
|
||||
CCV := $(shell $(CC) --version | head -n 1)
|
||||
CXXV := $(shell $(CXX) --version | head -n 1)
|
||||
|
||||
#
|
||||
# Compile flags
|
||||
#
|
||||
@@ -178,24 +159,6 @@ else
|
||||
MK_CPPFLAGS += -DNDEBUG
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SANITIZE_THREAD
|
||||
MK_CFLAGS += -fsanitize=thread -g
|
||||
MK_CXXFLAGS += -fsanitize=thread -g
|
||||
MK_LDFLAGS += -fsanitize=thread -g
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SANITIZE_ADDRESS
|
||||
MK_CFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
||||
MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
||||
MK_LDFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SANITIZE_UNDEFINED
|
||||
MK_CFLAGS += -fsanitize=undefined -g
|
||||
MK_CXXFLAGS += -fsanitize=undefined -g
|
||||
MK_LDFLAGS += -fsanitize=undefined -g
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SERVER_VERBOSE
|
||||
MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
|
||||
endif
|
||||
@@ -210,33 +173,20 @@ ifdef LLAMA_DISABLE_LOGS
|
||||
endif # LLAMA_DISABLE_LOGS
|
||||
|
||||
# warnings
|
||||
WARN_FLAGS = -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
|
||||
MK_CFLAGS += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int \
|
||||
-Werror=implicit-function-declaration
|
||||
MK_CXXFLAGS += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
|
||||
MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
|
||||
-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
|
||||
MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wmissing-declarations -Wno-unused-function -Wno-multichar
|
||||
|
||||
ifeq ($(CC_IS_CLANG), 1)
|
||||
# clang options
|
||||
MK_CFLAGS += -Wunreachable-code-break -Wunreachable-code-return
|
||||
MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
|
||||
# TODO(cebtenzzre): remove this once PR #2632 gets merged
|
||||
TTFS_CXXFLAGS = $(CXXFLAGS) -Wno-missing-declarations
|
||||
|
||||
ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
|
||||
MK_CFLAGS += -Wdouble-promotion
|
||||
endif
|
||||
ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
|
||||
MK_CFLAGS += -Wdouble-promotion
|
||||
endif
|
||||
ifneq '' '$(findstring clang,$(shell $(CXX) --version))'
|
||||
# clang++ only
|
||||
MK_CXXFLAGS += -Wmissing-prototypes
|
||||
TTFS_CXXFLAGS += -Wno-missing-prototypes
|
||||
else
|
||||
# gcc options
|
||||
MK_CFLAGS += -Wdouble-promotion
|
||||
MK_HOST_CXXFLAGS += -Wno-array-bounds
|
||||
|
||||
ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
|
||||
MK_HOST_CXXFLAGS += -Wno-format-truncation
|
||||
endif
|
||||
ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
|
||||
MK_HOST_CXXFLAGS += -Wextra-semi
|
||||
endif
|
||||
# g++ only
|
||||
MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
|
||||
endif
|
||||
|
||||
# OS specific
|
||||
@@ -432,7 +382,7 @@ ifdef LLAMA_CUDA_CCBIN
|
||||
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
||||
endif
|
||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
$(NVCC) $(NVCCFLAGS) -c $< -o $@
|
||||
$(NVCC) $(NVCCFLAGS) -Wno-pedantic -c $< -o $@
|
||||
endif # LLAMA_CUBLAS
|
||||
|
||||
ifdef LLAMA_CLBLAST
|
||||
@@ -522,8 +472,8 @@ $(info I CFLAGS: $(CFLAGS))
|
||||
$(info I CXXFLAGS: $(CXXFLAGS))
|
||||
$(info I NVCCFLAGS: $(NVCCFLAGS))
|
||||
$(info I LDFLAGS: $(LDFLAGS))
|
||||
$(info I CC: $(shell $(CC) --version | head -n 1))
|
||||
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
||||
$(info I CC: $(CCV))
|
||||
$(info I CXX: $(CXXV))
|
||||
$(info )
|
||||
|
||||
#
|
||||
@@ -536,21 +486,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
|
||||
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
OBJS += ggml-alloc.o
|
||||
|
||||
OBJS += ggml-alloc.o ggml-backend.o
|
||||
|
||||
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
||||
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h
|
||||
COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o grammar-parser.o
|
||||
|
||||
common.o: common/common.cpp $(COMMON_H_DEPS)
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
|
||||
common.o: common/common.cpp common/common.h build-info.h common/log.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
console.o: common/console.cpp common/console.h
|
||||
@@ -559,9 +500,6 @@ console.o: common/console.cpp common/console.h
|
||||
grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
train.o: common/train.cpp common/train.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
libllama.so: llama.o ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
@@ -572,22 +510,16 @@ clean:
|
||||
# Examples
|
||||
#
|
||||
|
||||
main: examples/main/main.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||
main: examples/main/main.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
@echo
|
||||
@echo '==== Run ./main -h for help. ===='
|
||||
@echo
|
||||
|
||||
infill: examples/infill/infill.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
|
||||
simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
simple: examples/simple/simple.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
batched: examples/batched/batched.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
batched: examples/batched/batched.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
@@ -596,56 +528,47 @@ quantize: examples/quantize/quantize.cpp build-info.h ggml.
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
||||
|
||||
$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
|
||||
|
||||
|
||||
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
|
||||
|
||||
gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
|
||||
|
||||
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
|
||||
speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
@@ -653,11 +576,6 @@ metal: examples/metal/metal.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
swift: examples/batched.swift
|
||||
(cd examples/batched.swift; make build)
|
||||
endif
|
||||
|
||||
build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
||||
@sh scripts/build-info.sh $(CC) > $@.tmp
|
||||
@if ! cmp -s $@.tmp $@; then \
|
||||
@@ -674,52 +592,42 @@ tests: $(TEST_TARGETS)
|
||||
|
||||
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
run-benchmark-matmult: benchmark-matmult
|
||||
./$@
|
||||
|
||||
.PHONY: run-benchmark-matmult swift
|
||||
|
||||
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-c.o: tests/test-c.c llama.h
|
||||
|
||||
@@ -1,27 +1,24 @@
// swift-tools-version:5.5
// swift-tools-version:5.3

import PackageDescription

#if arch(arm) || arch(arm64)
let platforms: [SupportedPlatform]? = [
.macOS(.v12),
.macOS(.v11),
.iOS(.v14),
.watchOS(.v4),
.tvOS(.v14)
]
let exclude: [String] = []
let resources: [Resource] = [
.process("ggml-metal.metal")
]
let additionalSources: [String] = ["ggml-metal.m"]
let additionalSettings: [CSetting] = [
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_SWIFT"),
.define("GGML_USE_METAL")
]
#else
let platforms: [SupportedPlatform]? = nil
let exclude: [String] = ["ggml-metal.metal"]
let resources: [Resource] = []
let additionalSources: [String] = []
let additionalSettings: [CSetting] = []
#endif
@@ -41,20 +38,15 @@ let package = Package(
"ggml.c",
"llama.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"k_quants.c",
] + additionalSources,
resources: resources,
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.unsafeFlags(["-Wno-shorten-64-to-32"]),
.define("GGML_USE_K_QUANTS"),
.define("GGML_USE_ACCELERATE")
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14
// (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
.define("ACCELERATE_NEW_LAPACK")
.define("ACCELERATE_LAPACK_ILP64")
] + additionalSettings,
linkerSettings: [
.linkedFramework("Accelerate")
README.md (65 changes)
@@ -5,14 +5,13 @@
[](https://github.com/ggerganov/llama.cpp/actions)
[](https://opensource.org/licenses/MIT)

[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

### Hot topics
- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252)
- ‼️ Breaking change: `rope_freq_base` and `rope_freq_scale` must be set to zero to use the model default values: [#3401](https://github.com/ggerganov/llama.cpp/pull/3401)
- Parallel decoding + continuous batching support added: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \

- Parallel decoding + continuous batching support incoming: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
**Devs should become familiar with the new API**
- Local Falcon 180B inference on Mac Studio

@@ -89,17 +88,11 @@ as the main playground for developing new features for the [ggml](https://github
- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
- [X] [Pygmalion/Metharme](#using-pygmalion-7b--metharme-7b)
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)

- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
- [X] Mistral AI v0.1

**Bindings:**

@@ -208,7 +201,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8

## Usage

Here are the end-to-end binary build and model conversion steps for the LLaMA-7B model.
Here are the steps for the LLaMA-7B model.

### Get the Code

@@ -281,7 +274,7 @@ In order to build llama.cpp you have three different options.
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.

When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
When built with Metal support, you can explicitly disable GPU inference with the `--gpu-layers|-ngl 0` command-line
argument.
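For illustration, a CPU-only run on a Metal-enabled build might look like the following sketch (the model path is only an example, borrowed from the perplexity section later in this README):

```bash
# keep the Metal build, but offload zero layers so inference stays on the CPU
./main -m models/7B/ggml-model-q4_0.gguf -p "Hello" -ngl 0
```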
### MPI Build
@@ -382,7 +375,7 @@ Building the program with BLAS support may lead to some performance improvements

- #### cuBLAS

This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
- Using `make`:
```bash
make LLAMA_CUBLAS=1
```
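The same acceleration can also be enabled from CMake; a minimal sketch, assuming the flag used by the CI workflow in this compare (`-DLLAMA_CUBLAS=ON`) and an out-of-source build:

```bash
mkdir build
cd build
cmake .. -DLLAMA_CUBLAS=ON        # enable the cuBLAS backend
cmake --build . --config Release
```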
@@ -575,18 +568,6 @@ python3 convert.py models/7B/

When running the larger models, make sure you have enough disk space to store all the intermediate files.

### Running on Windows with prebuilt binaries

You will find prebuilt Windows binaries on the release page.

Simply download and extract the latest zip package of choice: (e.g. `llama-b1380-bin-win-avx2-x64.zip`)

From the unzipped folder, open a terminal/cmd window here and place a pre-converted `.gguf` model file. Test out the main example like so:

```
.\main -m llama-2-7b.Q4_0.gguf -n 128
```

### Memory/Disk Requirements

As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
@@ -630,18 +611,6 @@ For more information, see [https://huggingface.co/docs/transformers/perplexity](
The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads.

#### How to run

1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
```
perplexity : calculating perplexity over 655 chunks
24.43 seconds per pass - ETA 4.45 hours
[1]4.5970,[2]5.1807,[3]6.0382,...
```
And after 4.45 hours, you will have the final perplexity.

### Interactive mode

If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
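A hypothetical interactive invocation, reusing file names that appear elsewhere in this README (the model path from the perplexity example and the prompt file from the prompt-cache example):

```bash
# -i switches to interactive mode; -r hands control back to the user whenever "User:" is generated
./main -m models/7B/ggml-model-q4_0.gguf -i -f prompts/chat-with-bob.txt -r "User:"
```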
@@ -693,8 +662,6 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \

The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).

For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
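As a sketch of how such a grammar is applied at run time (the `--grammar-file` flag and the bundled `grammars/json.gbnf` sample are assumptions here, not quoted from the text above):

```bash
# constrain generation to valid JSON using one of the sample grammars
./main -m models/7B/ggml-model-q4_0.gguf -p "Describe a cat as JSON:" --grammar-file grammars/json.gbnf
```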
### Instruction mode with Alpaca

1. First, download the `ggml` Alpaca model into the `./models` folder
@@ -804,6 +771,18 @@ If your issue is with model generation quality, then please at least scan the fo
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)

#### How to run

1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
3. Output:
```
perplexity : calculating perplexity over 655 chunks
24.43 seconds per pass - ETA 4.45 hours
[1]4.5970,[2]5.1807,[3]6.0382,...
```
And after 4.45 hours, you will have the final perplexity.

### Android

#### Building the Project using Android NDK
build.zig (47 changes)
@@ -36,17 +36,14 @@ const Maker = struct {
}

fn init(builder: *std.build.Builder) !Maker {
// const commit_hash = @embedFile(".git/refs/heads/master");
const target = builder.standardTargetOptions(.{});
const zig_version = @import("builtin").zig_version_string;
const commit_hash = try std.ChildProcess.exec(
.{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
);
const config_header = builder.addConfigHeader(
.{ .style = .blank, .include_path = "build-info.h" },
.{
.BUILD_NUMBER = 0,
.BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
.BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
.BUILD_COMMIT = "12345", // omit newline
.BUILD_COMPILER = "Zig 0.11.0",
.BUILD_TARGET = try target.allocDescription(builder.allocator),
},
);
@@ -70,20 +67,12 @@ const Maker = struct {

fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
if (o.target.getAbi() != .msvc)
o.defineCMacro("_GNU_SOURCE", null);
o.addConfigHeader(m.config_header);
if (std.mem.endsWith(u8, src, ".c")) {
o.addCSourceFiles(&.{src}, m.cflags.items);
o.linkLibC();
} else {
o.addCSourceFiles(&.{src}, m.cxxflags.items);
if (o.target.getAbi() == .msvc) {
o.linkLibC(); // need winsdk + crt
} else {
// linkLibCpp already add (libc++ + libunwind + libc)
o.linkLibCpp();
}
o.linkLibCpp();
}
o.addConfigHeader(m.config_header);
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
@@ -97,14 +86,8 @@ const Maker = struct {
for (deps) |d| e.addObject(d);
for (m.objs.items) |o| e.addObject(o);
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });

// https://github.com/ziglang/zig/issues/15448
if (e.target.getAbi() == .msvc) {
e.linkLibC(); // need winsdk + crt
} else {
// linkLibCpp already add (libc++ + libunwind + libc)
e.linkLibCpp();
}
e.linkLibC();
e.linkLibCpp();
e.addConfigHeader(m.config_header);
m.builder.installArtifact(e);
e.want_lto = m.enable_lto;
@@ -124,22 +107,18 @@ pub fn build(b: *std.build.Builder) !void {

const ggml = make.obj("ggml", "ggml.c");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
const llama = make.obj("llama", "llama.cpp");
const common = make.obj("common", "common/common.cpp");
const console = make.obj("console", "common/console.cpp");
const sampling = make.obj("sampling", "common/sampling.cpp");
const console = make.obj("common", "common/console.cpp");
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
const train = make.obj("train", "common/train.cpp");

_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common });

const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
ci/run.sh (16 changes)
@@ -208,8 +208,6 @@ function gg_run_open_llama_3b_v2 {
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
@@ -298,7 +296,6 @@ function gg_sum_open_llama_3b_v2 {
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
@@ -385,8 +382,6 @@ function gg_run_open_llama_7b_v2 {
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
@@ -475,7 +470,6 @@ function gg_sum_open_llama_7b_v2 {
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
@@ -502,12 +496,10 @@ test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release

if [ -z ${GG_BUILD_LOW_PERF} ]; then
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2
else
test $ret -eq 0 && gg_run open_llama_7b_v2
fi
if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2
else
test $ret -eq 0 && gg_run open_llama_7b_v2
fi
fi
@@ -5,14 +5,10 @@ set(TARGET common)
add_library(${TARGET} OBJECT
common.h
common.cpp
sampling.h
sampling.cpp
console.h
console.cpp
grammar-parser.h
grammar-parser.cpp
train.h
train.cpp
)

if (BUILD_SHARED_LIBS)
@@ -78,7 +78,7 @@ int32_t get_num_physical_cores() {
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}

void process_escapes(std::string& input) {
static void process_escapes(std::string& input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;

@@ -107,7 +107,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
std::string arg;
gpt_params default_params;
const std::string arg_prefix = "--";
llama_sampling_params & sparams = params.sampling_params;

for (int i = 1; i < argc; i++) {
arg = argv[i];
@@ -130,15 +129,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
if (params.n_threads <= 0) {
params.n_threads = std::thread::hardware_concurrency();
}
} else if (arg == "-tb" || arg == "--threads-batch") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads_batch = std::stoi(argv[i]);
if (params.n_threads_batch <= 0) {
params.n_threads_batch = std::thread::hardware_concurrency();
}
} else if (arg == "-p" || arg == "--prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -168,10 +158,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
// store the external file name in params
params.prompt_file = argv[i];
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (!params.prompt.empty() && params.prompt.back() == '\n') {
if (params.prompt.back() == '\n') {
params.prompt.pop_back();
}
} else if (arg == "-n" || arg == "--n-predict") {
@@ -185,7 +173,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
sparams.top_k = std::stoi(argv[i]);
params.top_k = std::stoi(argv[i]);
} else if (arg == "-c" || arg == "--ctx-size") {
if (++i >= argc) {
invalid_param = true;
@@ -217,73 +205,73 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
sparams.top_p = std::stof(argv[i]);
params.top_p = std::stof(argv[i]);
} else if (arg == "--temp") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.temp = std::stof(argv[i]);
params.temp = std::stof(argv[i]);
} else if (arg == "--tfs") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.tfs_z = std::stof(argv[i]);
params.tfs_z = std::stof(argv[i]);
} else if (arg == "--typical") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.typical_p = std::stof(argv[i]);
params.typical_p = std::stof(argv[i]);
} else if (arg == "--repeat-last-n") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.repeat_last_n = std::stoi(argv[i]);
params.repeat_last_n = std::stoi(argv[i]);
} else if (arg == "--repeat-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.repeat_penalty = std::stof(argv[i]);
params.repeat_penalty = std::stof(argv[i]);
} else if (arg == "--frequency-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.frequency_penalty = std::stof(argv[i]);
params.frequency_penalty = std::stof(argv[i]);
} else if (arg == "--presence-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
|
||||
}
|
||||
sparams.presence_penalty = std::stof(argv[i]);
|
||||
params.presence_penalty = std::stof(argv[i]);
|
||||
} else if (arg == "--mirostat") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.mirostat = std::stoi(argv[i]);
|
||||
params.mirostat = std::stoi(argv[i]);
|
||||
} else if (arg == "--mirostat-lr") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.mirostat_eta = std::stof(argv[i]);
|
||||
params.mirostat_eta = std::stof(argv[i]);
|
||||
} else if (arg == "--mirostat-ent") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.mirostat_tau = std::stof(argv[i]);
|
||||
params.mirostat_tau = std::stof(argv[i]);
|
||||
} else if (arg == "--cfg-negative-prompt") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.cfg_negative_prompt = argv[i];
|
||||
params.cfg_negative_prompt = argv[i];
|
||||
} else if (arg == "--cfg-negative-prompt-file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -295,16 +283,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
|
||||
if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
|
||||
sparams.cfg_negative_prompt.pop_back();
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
|
||||
if (params.cfg_negative_prompt.back() == '\n') {
|
||||
params.cfg_negative_prompt.pop_back();
|
||||
}
|
||||
} else if (arg == "--cfg-scale") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.cfg_scale = std::stof(argv[i]);
|
||||
params.cfg_scale = std::stof(argv[i]);
|
||||
} else if (arg == "-b" || arg == "--batch-size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -364,19 +352,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-scaled") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
const char * lora_adapter = argv[i];
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
|
||||
params.lora_adapter = argv[i];
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-base") {
|
||||
if (++i >= argc) {
|
||||
@@ -384,18 +360,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.lora_base = argv[i];
|
||||
} else if (arg == "--mmproj") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.mmproj = argv[i];
|
||||
} else if (arg == "--image") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.image = argv[i];
|
||||
} else if (arg == "-i" || arg == "--interactive") {
|
||||
params.interactive = true;
|
||||
} else if (arg == "--embedding") {
|
||||
@@ -404,8 +368,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
params.interactive_first = true;
|
||||
} else if (arg == "-ins" || arg == "--instruct") {
|
||||
params.instruct = true;
|
||||
} else if (arg == "--infill") {
|
||||
params.infill = true;
|
||||
} else if (arg == "--multiline-input") {
|
||||
params.multiline_input = true;
|
||||
} else if (arg == "--simple-io") {
|
||||
@@ -477,6 +439,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
params.mul_mat_q = false;
|
||||
#else
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
} else if (arg == "--low-vram" || arg == "-lv") {
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
params.low_vram = true;
|
||||
#else
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
} else if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
@@ -525,7 +493,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
} else if (arg == "--ignore-eos") {
|
||||
params.ignore_eos = true;
|
||||
} else if (arg == "--no-penalize-nl") {
|
||||
sparams.penalize_nl = false;
|
||||
params.penalize_nl = false;
|
||||
} else if (arg == "-l" || arg == "--logit-bias") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -537,7 +505,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
std::string value_str;
|
||||
try {
|
||||
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
|
||||
sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
||||
params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
|
||||
} else {
|
||||
throw std::exception();
|
||||
}
|
||||
@@ -631,17 +599,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
process_escapes(params.prompt);
|
||||
process_escapes(params.input_prefix);
|
||||
process_escapes(params.input_suffix);
|
||||
for (auto & antiprompt : params.antiprompt) {
|
||||
process_escapes(antiprompt);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
const llama_sampling_params & sparams = params.sampling_params;
|
||||
|
||||
printf("usage: %s [options]\n", argv[0]);
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
@@ -655,9 +618,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" (can be specified more than once for multiple prompts).\n");
|
||||
printf(" --color colorise output to distinguish prompt and user input from generations\n");
|
||||
printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
|
||||
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
|
||||
printf(" -tb N, --threads-batch N\n");
|
||||
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -p PROMPT, --prompt PROMPT\n");
|
||||
printf(" prompt to start generation with (default: empty)\n");
|
||||
printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
|
||||
@@ -672,21 +633,21 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" -f FNAME, --file FNAME\n");
|
||||
printf(" prompt file to start generation.\n");
|
||||
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
|
||||
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
||||
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
|
||||
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
|
||||
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
|
||||
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
|
||||
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
|
||||
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
|
||||
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
|
||||
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
|
||||
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
|
||||
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
|
||||
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
|
||||
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
|
||||
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
|
||||
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
|
||||
printf(" --mirostat N use Mirostat sampling.\n");
|
||||
printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
|
||||
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
|
||||
printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
|
||||
printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
|
||||
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
|
||||
printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
|
||||
printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
|
||||
printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
|
||||
printf(" modifies the likelihood of token appearing in the completion,\n");
|
||||
printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
|
||||
@@ -697,7 +658,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" negative prompt to use for guidance. (default: empty)\n");
|
||||
printf(" --cfg-negative-prompt-file FNAME\n");
|
||||
printf(" negative prompt file to use for guidance. (default: empty)\n");
|
||||
printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
|
||||
printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
|
||||
printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
|
||||
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
|
||||
@@ -705,7 +666,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" --no-penalize-nl do not penalize newline token\n");
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
|
||||
printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
|
||||
printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
|
||||
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
||||
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
||||
@@ -715,8 +676,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
|
||||
printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
|
||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
|
||||
printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
|
||||
if (llama_mlock_supported()) {
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
@@ -734,6 +693,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
||||
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||
printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
printf(" -nommq, --no-mul-mat-q\n");
|
||||
printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
|
||||
@@ -743,7 +703,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" --verbose-prompt print prompt before generation\n");
|
||||
fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
|
||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
|
||||
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
printf(" -m FNAME, --model FNAME\n");
|
||||
printf(" model path (default: %s)\n", params.model.c_str());
|
||||
@@ -754,18 +713,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
std::string get_system_info(const gpt_params & params) {
|
||||
std::ostringstream os;
|
||||
|
||||
os << "system_info: n_threads = " << params.n_threads;
|
||||
if (params.n_threads_batch != -1) {
|
||||
os << " (n_threads_batch = " << params.n_threads_batch << ")";
|
||||
}
|
||||
os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
|
||||
|
||||
return os.str();
|
||||
}
|
||||
|
||||
std::string gpt_random_prompt(std::mt19937 & rng) {
|
||||
const int r = rng() % 10;
|
||||
switch (r) {
|
||||
@@ -779,95 +726,60 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
|
||||
case 7: return "He";
|
||||
case 8: return "She";
|
||||
case 9: return "They";
|
||||
default: return "To";
|
||||
}
|
||||
|
||||
GGML_UNREACHABLE();
|
||||
return "The";
|
||||
}
|
||||
|
||||
//
|
||||
// Model utils
|
||||
//
|
||||
|
||||
struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
|
||||
auto mparams = llama_model_default_params();
|
||||
|
||||
if (params.n_gpu_layers != -1) {
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
mparams.main_gpu = params.main_gpu;
|
||||
mparams.tensor_split = params.tensor_split;
|
||||
mparams.use_mmap = params.use_mmap;
|
||||
mparams.use_mlock = params.use_mlock;
|
||||
|
||||
return mparams;
|
||||
}
|
||||
|
||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
|
||||
auto cparams = llama_context_default_params();
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
cparams.n_ctx = params.n_ctx;
|
||||
cparams.n_batch = params.n_batch;
|
||||
cparams.n_threads = params.n_threads;
|
||||
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
cparams.mul_mat_q = params.mul_mat_q;
|
||||
cparams.seed = params.seed;
|
||||
cparams.f16_kv = params.memory_f16;
|
||||
cparams.logits_all = params.logits_all;
|
||||
cparams.embedding = params.embedding;
|
||||
cparams.rope_freq_base = params.rope_freq_base;
|
||||
cparams.rope_freq_scale = params.rope_freq_scale;
|
||||
|
||||
return cparams;
|
||||
}
|
||||
|
||||
void llama_batch_clear(struct llama_batch & batch) {
|
||||
batch.n_tokens = 0;
|
||||
}
|
||||
|
||||
void llama_batch_add(
|
||||
struct llama_batch & batch,
|
||||
llama_token id,
|
||||
llama_pos pos,
|
||||
const std::vector<llama_seq_id> & seq_ids,
|
||||
bool logits) {
|
||||
batch.token [batch.n_tokens] = id;
|
||||
batch.pos [batch.n_tokens] = pos,
|
||||
batch.n_seq_id[batch.n_tokens] = seq_ids.size();
|
||||
for (size_t i = 0; i < seq_ids.size(); ++i) {
|
||||
batch.seq_id[batch.n_tokens][i] = seq_ids[i];
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_batch = params.n_batch;
|
||||
if (params.n_gpu_layers != -1) {
|
||||
lparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
batch.logits [batch.n_tokens] = logits;
|
||||
lparams.main_gpu = params.main_gpu;
|
||||
lparams.tensor_split = params.tensor_split;
|
||||
lparams.low_vram = params.low_vram;
|
||||
lparams.mul_mat_q = params.mul_mat_q;
|
||||
lparams.seed = params.seed;
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
lparams.use_mmap = params.use_mmap;
|
||||
lparams.use_mlock = params.use_mlock;
|
||||
lparams.logits_all = params.logits_all;
|
||||
lparams.embedding = params.embedding;
|
||||
lparams.rope_freq_base = params.rope_freq_base;
|
||||
lparams.rope_freq_scale = params.rope_freq_scale;
|
||||
|
||||
batch.n_tokens++;
|
||||
return lparams;
|
||||
}
|
||||
|
||||
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
|
||||
auto mparams = llama_model_params_from_gpt_params(params);
|
||||
auto lparams = llama_context_params_from_gpt_params(params);
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
||||
return std::make_tuple(nullptr, nullptr);
|
||||
}
|
||||
|
||||
auto cparams = llama_context_params_from_gpt_params(params);
|
||||
|
||||
llama_context * lctx = llama_new_context_with_model(model, cparams);
|
||||
llama_context * lctx = llama_new_context_with_model(model, lparams);
|
||||
if (lctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
||||
llama_free_model(model);
|
||||
return std::make_tuple(nullptr, nullptr);
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
|
||||
const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
|
||||
float lora_scale = std::get<1>(params.lora_adapter[i]);
|
||||
if (!params.lora_adapter.empty()) {
|
||||
int err = llama_model_apply_lora_from_file(model,
|
||||
lora_adapter.c_str(),
|
||||
lora_scale,
|
||||
((i > 0) || params.lora_base.empty())
|
||||
? NULL
|
||||
: params.lora_base.c_str(),
|
||||
params.lora_adapter.c_str(),
|
||||
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
|
||||
params.n_threads);
|
||||
if (err != 0) {
|
||||
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||
@@ -878,14 +790,14 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||
}
|
||||
|
||||
if (params.ignore_eos) {
|
||||
params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
|
||||
params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
|
||||
}
|
||||
|
||||
{
|
||||
LOG("warming up the model with an empty run\n");
|
||||
|
||||
std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
|
||||
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
|
||||
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0), params.n_threads);
|
||||
llama_kv_cache_tokens_rm(lctx, -1, -1);
|
||||
llama_reset_timings(lctx);
|
||||
}
|
||||
@@ -898,25 +810,16 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||
//
|
||||
|
||||
std::vector<llama_token> llama_tokenize(
|
||||
const struct llama_context * ctx,
|
||||
struct llama_context * ctx,
|
||||
const std::string & text,
|
||||
bool add_bos,
|
||||
bool special) {
|
||||
return llama_tokenize(llama_get_model(ctx), text, add_bos, special);
|
||||
}
|
||||
|
||||
std::vector<llama_token> llama_tokenize(
|
||||
const struct llama_model * model,
|
||||
const std::string & text,
|
||||
bool add_bos,
|
||||
bool special) {
|
||||
bool add_bos) {
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + add_bos;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
|
||||
n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special);
|
||||
int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
@@ -926,10 +829,10 @@ std::vector<llama_token> llama_tokenize(
|
||||
|
||||
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
|
||||
std::vector<char> result(8, 0);
|
||||
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
|
||||
const int n_tokens = llama_token_to_piece(ctx, token, result.data(), result.size());
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
|
||||
int check = llama_token_to_piece(ctx, token, result.data(), result.size());
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
@@ -968,10 +871,129 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
|
||||
result += piece;
|
||||
}
|
||||
|
||||
// NOTE: the original tokenizer decodes bytes after collecting the pieces.
|
||||
return result;
|
||||
}
|
||||
|
||||
//
|
||||
// Sampling utils
|
||||
//
|
||||
|
||||
llama_token llama_sample_token(
|
||||
struct llama_context * ctx,
|
||||
struct llama_context * ctx_guidance,
|
||||
struct llama_grammar * grammar,
|
||||
const struct gpt_params & params,
|
||||
const std::vector<llama_token> & last_tokens,
|
||||
std::vector<llama_token_data> & candidates,
|
||||
int idx) {
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
const float temp = params.temp;
|
||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||
const float repeat_penalty = params.repeat_penalty;
|
||||
const float alpha_presence = params.presence_penalty;
|
||||
const float alpha_frequency = params.frequency_penalty;
|
||||
const int mirostat = params.mirostat;
|
||||
const float mirostat_tau = params.mirostat_tau;
|
||||
const float mirostat_eta = params.mirostat_eta;
|
||||
const bool penalize_nl = params.penalize_nl;
|
||||
|
||||
llama_token id = 0;
|
||||
|
||||
float * logits = llama_get_logits_ith(ctx, idx);
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
candidates.clear();
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
if (ctx_guidance) {
|
||||
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
|
||||
}
|
||||
|
||||
// apply penalties
|
||||
if (!last_tokens.empty()) {
|
||||
const float nl_logit = logits[llama_token_nl(ctx)];
|
||||
const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
|
||||
|
||||
llama_sample_repetition_penalty(ctx, &cur_p,
|
||||
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, repeat_penalty);
|
||||
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
|
||||
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, alpha_frequency, alpha_presence);
|
||||
|
||||
if (!penalize_nl) {
|
||||
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
||||
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
|
||||
cur_p.data[idx].logit = nl_logit;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (grammar != NULL) {
|
||||
llama_sample_grammar(ctx, &cur_p, grammar);
|
||||
}
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
id = llama_sample_token_greedy(ctx, &cur_p);
|
||||
} else {
|
||||
if (mirostat == 1) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temp(ctx, &cur_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
} else if (mirostat == 2) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temp(ctx, &cur_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
llama_sample_top_k (ctx, &cur_p, top_k, 1);
|
||||
llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
|
||||
llama_sample_typical (ctx, &cur_p, typical_p, 1);
|
||||
llama_sample_top_p (ctx, &cur_p, top_p, 1);
|
||||
llama_sample_temp(ctx, &cur_p, temp);
|
||||
|
||||
{
|
||||
const int n_top = 10;
|
||||
LOG("top %d candidates:\n", n_top);
|
||||
|
||||
for (int i = 0; i < n_top; i++) {
|
||||
const llama_token id = cur_p.data[i].id;
|
||||
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
|
||||
}
|
||||
}
|
||||
|
||||
id = llama_sample_token(ctx, &cur_p);
|
||||
|
||||
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
}
|
||||
// printf("`%d`", candidates_p.size);
|
||||
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_accept_token(ctx, grammar, id);
|
||||
}
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
//
|
||||
// YAML utils
|
||||
//
|
||||
@@ -1123,8 +1145,6 @@ std::string get_sortable_timestamp() {
|
||||
|
||||
void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||
const llama_sampling_params & sparams = params.sampling_params;
|
||||
|
||||
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
|
||||
fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
|
||||
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
||||
@@ -1153,7 +1173,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
#endif // NDEBUG
|
||||
|
||||
fprintf(stream, "model_desc: %s\n", model_desc);
|
||||
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
|
||||
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(lctx));
|
||||
|
||||
#ifdef __OPTIMIZE__
|
||||
fprintf(stream, "optimize: true\n");
|
||||
@@ -1171,21 +1191,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
|
||||
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
||||
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
||||
dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
|
||||
fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
|
||||
dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
|
||||
fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
|
||||
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
||||
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
||||
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
||||
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
||||
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
||||
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
|
||||
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
|
||||
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
|
||||
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
||||
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
||||
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
||||
|
||||
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
|
||||
const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
||||
const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
|
||||
const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
||||
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
||||
|
||||
dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
||||
@@ -1198,54 +1218,42 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
||||
|
||||
fprintf(stream, "logit_bias:\n");
|
||||
for (std::pair<llama_token, float> lb : sparams.logit_bias) {
|
||||
for (std::pair<llama_token, float> lb : params.logit_bias) {
|
||||
if (ignore_eos && lb.first == logit_bias_eos->first) {
|
||||
continue;
|
||||
}
|
||||
fprintf(stream, " %d: %f", lb.first, lb.second);
|
||||
}
|
||||
|
||||
fprintf(stream, "lora:\n");
|
||||
for (std::tuple<std::string, float> la : params.lora_adapter) {
|
||||
if (std::get<1>(la) != 1.0f) {
|
||||
continue;
|
||||
}
|
||||
fprintf(stream, " - %s\n", std::get<0>(la).c_str());
|
||||
}
|
||||
fprintf(stream, "lora_scaled:\n");
|
||||
for (std::tuple<std::string, float> la : params.lora_adapter) {
|
||||
if (std::get<1>(la) == 1.0f) {
|
||||
continue;
|
||||
}
|
||||
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
|
||||
}
|
||||
fprintf(stream, "lora: %s\n", params.lora_adapter.c_str());
|
||||
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
||||
fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
|
||||
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
||||
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
|
||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
|
||||
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
|
||||
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
|
||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
|
||||
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
|
||||
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
|
||||
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
||||
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
|
||||
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
||||
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
||||
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
||||
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
||||
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
|
||||
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
|
||||
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
||||
fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
|
||||
fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
|
||||
fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
|
||||
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
|
||||
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
||||
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
||||
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
|
||||
fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
|
||||
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
|
||||
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
||||
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
||||
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
||||
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
|
||||
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
|
||||
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
|
||||
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
|
||||
|
||||
fprintf(stream, "reverse_prompt:\n");
|
||||
for (std::string ap : params.antiprompt) {
|
||||
@@ -1263,15 +1271,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
|
||||
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
||||
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
|
||||
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
|
||||
fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
|
||||
|
||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
|
||||
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
|
||||
|
||||
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
|
||||
fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
|
||||
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
|
||||
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
|
||||
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
|
||||
fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
|
||||
fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
|
||||
fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
|
||||
fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
|
||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
||||
}
|
||||
|
||||
@@ -4,8 +4,6 @@
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include "sampling.h"
|
||||
|
||||
#define LOG_NO_FILE_LINE_FUNCTION
|
||||
#include "log.h"
|
||||
|
||||
@@ -38,7 +36,6 @@ int32_t get_num_physical_cores();
|
||||
struct gpt_params {
|
||||
uint32_t seed = -1; // RNG seed
|
||||
int32_t n_threads = get_num_physical_cores();
|
||||
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
@@ -51,18 +48,36 @@ struct gpt_params {
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||
float rope_freq_base = 0.0f; // RoPE base frequency
|
||||
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
|
||||
|
||||
// // sampling parameters
|
||||
struct llama_sampling_params sampling_params;
|
||||
// sampling parameters
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typical_p = 1.00f; // 1.0 = disabled
|
||||
float temp = 0.80f; // 1.0 = disabled
|
||||
float repeat_penalty = 1.10f; // 1.0 = disabled
|
||||
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float frequency_penalty = 0.00f; // 0.0 = disabled
|
||||
float presence_penalty = 0.00f; // 0.0 = disabled
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
|
||||
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||
|
||||
// Classifier-Free Guidance
|
||||
// https://arxiv.org/abs/2306.17806
|
||||
std::string cfg_negative_prompt; // string to help guidance
|
||||
float cfg_scale = 1.f; // How strong is guidance
|
||||
|
||||
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
|
||||
std::string model_draft = ""; // draft model for speculative decoding
|
||||
std::string model_alias = "unknown"; // model alias
|
||||
std::string prompt = "";
|
||||
std::string prompt_file = ""; // store the external prompt file name
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
std::string input_suffix = ""; // string to suffix user inputs with
|
||||
@@ -70,9 +85,8 @@ struct gpt_params {
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
std::string logdir = ""; // directory in which to save YAML log files
|
||||
|
||||
// TODO: avoid tuple, use struct
|
||||
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
std::string lora_adapter = ""; // lora adapter path
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
|
||||
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
|
||||
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
||||
@@ -81,6 +95,7 @@ struct gpt_params {
|
||||
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
||||
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
||||
|
||||
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
|
||||
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
|
||||
bool memory_f16 = true; // use f16 instead of f32 for memory kv
|
||||
bool random_prompt = false; // do not randomize prompt if none provided
|
||||
@@ -99,49 +114,27 @@ struct gpt_params {
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
bool ignore_eos = false; // ignore generated EOS tokens
|
||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||
bool logits_all = false; // return logits for all tokens in the batch
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool numa = false; // attempt optimizations that help on some NUMA systems
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
bool infill = false; // use infill mode
|
||||
|
||||
// multimodal models (see examples/llava)
|
||||
std::string mmproj = ""; // path to multimodal projector
|
||||
std::string image = ""; // path to an image file
|
||||
};
|
||||
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
||||
|
||||
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
|
||||
|
||||
std::string get_system_info(const gpt_params & params);
|
||||
|
||||
std::string gpt_random_prompt(std::mt19937 & rng);
|
||||
|
||||
void process_escapes(std::string& input);
|
||||
|
||||
//
|
||||
// Model utils
|
||||
//
|
||||
|
||||
// TODO: avoid tuplue, use struct
|
||||
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
|
||||
|
||||
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
|
||||
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
|
||||
|
||||
// Batch utils
|
||||
|
||||
void llama_batch_clear(struct llama_batch & batch);
|
||||
|
||||
void llama_batch_add(
|
||||
struct llama_batch & batch,
|
||||
llama_token id,
|
||||
llama_pos pos,
|
||||
const std::vector<llama_seq_id> & seq_ids,
|
||||
bool logits);
|
||||
|
||||
//
|
||||
// Vocab utils
|
||||
//
|
||||
@@ -149,16 +142,9 @@ void llama_batch_add(
|
||||
// tokenizes a string into a vector of tokens
|
||||
// should work similar to Python's `tokenizer.encode`
|
||||
std::vector<llama_token> llama_tokenize(
|
||||
const struct llama_context * ctx,
|
||||
struct llama_context * ctx,
|
||||
const std::string & text,
|
||||
bool add_bos,
|
||||
bool special = false);
|
||||
|
||||
std::vector<llama_token> llama_tokenize(
|
||||
const struct llama_model * model,
|
||||
const std::string & text,
|
||||
bool add_bos,
|
||||
bool special = false);
|
||||
bool add_bos);
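A minimal usage sketch for the tokenizer helper above, assuming the context-based overload shown in this hunk; the dump_tokens helper name is made up for illustration:

// print each token id together with its detokenized piece
#include "common.h"
#include <cstdio>
#include <string>
#include <vector>

static void dump_tokens(llama_context * ctx, const std::string & prompt) {
    // add_bos = true prepends the BOS token, mirroring Python's tokenizer.encode
    const std::vector<llama_token> tokens = llama_tokenize(ctx, prompt, true);
    for (const llama_token tok : tokens) {
        printf("%6d -> '%s'\n", tok, llama_token_to_piece(ctx, tok).c_str());
    }
}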
// tokenizes a token into a piece
|
||||
// should work similar to Python's `tokenizer.id_to_piece`
|
||||
@@ -182,6 +168,36 @@ std::string llama_detokenize_bpe(
|
||||
llama_context * ctx,
|
||||
const std::vector<llama_token> & tokens);
//
// Sampling utils
//
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
//
// required:
// - ctx: context to use for sampling
// - params: sampling parameters
//
// optional:
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
// - grammar: grammar to use for sampling, ignore if NULL
// - last_tokens: needed for repetition penalty, ignore if empty
// - idx: sample from llama_get_logits_ith(ctx, idx)
//
// returns:
// - token: sampled token
// - candidates: vector of candidate tokens
//
llama_token llama_sample_token(
struct llama_context * ctx,
struct llama_context * ctx_guidance,
struct llama_grammar * grammar,
const struct gpt_params & params,
const std::vector<llama_token> & last_tokens,
std::vector<llama_token_data> & candidates,
int idx = 0);
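A rough call-site sketch for the sampling helper declared above, using only what the comment block documents; ctx and params are assumed to be in scope, guidance and grammar are left disabled:

// sample the next token from the logits of the last decoded position
std::vector<llama_token_data> candidates;
candidates.reserve(llama_n_vocab(ctx));

std::vector<llama_token> last_tokens;  // history fed to the repetition penalties

const llama_token id = llama_sample_token(
    ctx,
    /*ctx_guidance =*/ nullptr,   // no classifier-free guidance
    /*grammar      =*/ nullptr,   // no grammar constraints
    params,
    last_tokens,
    candidates,
    /*idx          =*/ 0);        // read logits via llama_get_logits_ith(ctx, 0)

last_tokens.push_back(id);        // remember the token for future penalty passes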
//
// YAML utils
//
175
common/log.h
@@ -225,31 +225,31 @@ enum LogTriState
|
||||
// USE LOG() INSTEAD
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_IMPL(str, ...) \
|
||||
do { \
|
||||
#define LOG_IMPL(str, ...) \
|
||||
{ \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
|
||||
fflush(LOG_TARGET); \
|
||||
} \
|
||||
} while (0)
|
||||
}
|
||||
#else
|
||||
#define LOG_IMPL(str, ...) \
|
||||
do { \
|
||||
#define LOG_IMPL(str, ...) \
|
||||
{ \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
|
||||
fflush(LOG_TARGET); \
|
||||
} \
|
||||
} while (0)
|
||||
}
|
||||
#endif
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
// USE LOG_TEE() INSTEAD
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_TEE_IMPL(str, ...) \
|
||||
do { \
|
||||
#define LOG_TEE_IMPL(str, ...) \
|
||||
{ \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
|
||||
@@ -260,10 +260,10 @@ enum LogTriState
|
||||
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
|
||||
fflush(LOG_TEE_TARGET); \
|
||||
} \
|
||||
} while (0)
|
||||
}
|
||||
#else
|
||||
#define LOG_TEE_IMPL(str, ...) \
|
||||
do { \
|
||||
#define LOG_TEE_IMPL(str, ...) \
|
||||
{ \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
|
||||
@@ -274,7 +274,7 @@ enum LogTriState
|
||||
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
|
||||
fflush(LOG_TEE_TARGET); \
|
||||
} \
|
||||
} while (0)
|
||||
}
|
||||
#endif
|
||||
|
||||
// The '\0' as a last argument, is a trick to bypass the silly
|
||||
@@ -435,41 +435,41 @@ inline FILE *log_handler() { return log_handler1_impl(); }
|
||||
inline void log_test()
|
||||
{
|
||||
log_disable();
|
||||
LOG("01 Hello World to nobody, because logs are disabled!\n");
|
||||
LOG("01 Hello World to nobody, because logs are disabled!\n")
|
||||
log_enable();
|
||||
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
|
||||
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
|
||||
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET))
|
||||
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n")
|
||||
log_set_target(stderr);
|
||||
LOG("04 Hello World to stderr!\n");
|
||||
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
|
||||
LOG("04 Hello World to stderr!\n")
|
||||
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n")
|
||||
log_set_target(LOG_DEFAULT_FILE_NAME);
|
||||
LOG("06 Hello World to default log file!\n");
|
||||
LOG("06 Hello World to default log file!\n")
|
||||
log_set_target(stdout);
|
||||
LOG("07 Hello World to stdout!\n");
|
||||
LOG("07 Hello World to stdout!\n")
|
||||
log_set_target(LOG_DEFAULT_FILE_NAME);
|
||||
LOG("08 Hello World to default log file again!\n");
|
||||
LOG("08 Hello World to default log file again!\n")
|
||||
log_disable();
|
||||
LOG("09 Hello World _1_ into the void!\n");
|
||||
LOG("09 Hello World _1_ into the void!\n")
|
||||
log_enable();
|
||||
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
|
||||
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n")
|
||||
log_disable();
|
||||
log_set_target("llama.anotherlog.log");
|
||||
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
|
||||
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n")
|
||||
log_enable();
|
||||
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
|
||||
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n")
|
||||
log_set_target("llama.yetanotherlog.log");
|
||||
LOG("13 Hello World this time in yet new file?\n");
|
||||
LOG("13 Hello World this time in yet new file?\n")
|
||||
log_set_target(log_filename_generator("llama_autonamed", "log"));
|
||||
LOG("14 Hello World in log with generated filename!\n");
|
||||
LOG("14 Hello World in log with generated filename!\n")
|
||||
#ifdef _MSC_VER
|
||||
LOG_TEE("15 Hello msvc TEE without arguments\n");
|
||||
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
|
||||
LOG_TEELN("17 Hello msvc TEELN without arguments\n");
|
||||
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
|
||||
LOG("19 Hello msvc LOG without arguments\n");
|
||||
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
|
||||
LOGLN("21 Hello msvc LOGLN without arguments\n");
|
||||
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
|
||||
LOG_TEE("15 Hello msvc TEE without arguments\n")
|
||||
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test")
|
||||
LOG_TEELN("17 Hello msvc TEELN without arguments\n")
|
||||
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test")
|
||||
LOG("19 Hello msvc LOG without arguments\n")
|
||||
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test")
|
||||
LOGLN("21 Hello msvc LOGLN without arguments\n")
|
||||
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test")
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -542,7 +542,7 @@ inline void log_dump_cmdline_impl(int argc, char **argv)
|
||||
buf << " " << argv[i];
|
||||
}
|
||||
}
|
||||
LOGLN("Cmd:%s", buf.str().c_str());
|
||||
LOGLN("Cmd:%s", buf.str().c_str())
|
||||
}
|
||||
|
||||
#define log_tostr(var) log_var_to_string_impl(var).c_str()
|
||||
@@ -579,75 +579,38 @@ inline std::string log_var_to_string_impl(const std::vector<int> & var)
|
||||
return buf.str();
|
||||
}
|
||||
|
||||
template <typename C, typename T>
|
||||
inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
|
||||
{
|
||||
std::stringstream buf;
|
||||
buf << "[ ";
|
||||
|
||||
bool first = true;
|
||||
for (const auto &token : tokens)
|
||||
{
|
||||
if (!first) {
|
||||
buf << ", ";
|
||||
} else {
|
||||
first = false;
|
||||
}
|
||||
|
||||
auto detokenized = llama_token_to_piece(ctx, token);
|
||||
|
||||
detokenized.erase(
|
||||
std::remove_if(
|
||||
detokenized.begin(),
|
||||
detokenized.end(),
|
||||
[](const unsigned char c) { return !std::isprint(c); }),
|
||||
detokenized.end());
|
||||
|
||||
buf
|
||||
<< "'" << detokenized << "'"
|
||||
<< ":" << std::to_string(token);
|
||||
}
|
||||
buf << " ]";
|
||||
|
||||
return buf.str();
|
||||
}
|
||||
|
||||
template <typename C, typename B>
|
||||
inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
|
||||
{
|
||||
std::stringstream buf;
|
||||
buf << "[ ";
|
||||
|
||||
bool first = true;
|
||||
for (int i = 0; i < batch.n_tokens; ++i)
|
||||
{
|
||||
if (!first) {
|
||||
buf << ", ";
|
||||
} else {
|
||||
first = false;
|
||||
}
|
||||
|
||||
auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
|
||||
|
||||
detokenized.erase(
|
||||
std::remove_if(
|
||||
detokenized.begin(),
|
||||
detokenized.end(),
|
||||
[](const unsigned char c) { return !std::isprint(c); }),
|
||||
detokenized.end());
|
||||
|
||||
buf
|
||||
<< "\n" << std::to_string(i)
|
||||
<< ":token '" << detokenized << "'"
|
||||
<< ":pos " << std::to_string(batch.pos[i])
|
||||
<< ":n_seq_id " << std::to_string(batch.n_seq_id[i])
|
||||
<< ":seq_id " << std::to_string(batch.seq_id[i][0])
|
||||
<< ":logits " << std::to_string(batch.logits[i]);
|
||||
}
|
||||
buf << " ]";
|
||||
|
||||
return buf.str();
|
||||
}
|
||||
#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \
|
||||
[&tokens, &ctx]() \
|
||||
{ \
|
||||
std::stringstream buf; \
|
||||
buf << "[ "; \
|
||||
\
|
||||
bool first = true; \
|
||||
for (const auto &token : tokens) \
|
||||
{ \
|
||||
if (!first) \
|
||||
buf << ", "; \
|
||||
else \
|
||||
first = false; \
|
||||
\
|
||||
auto detokenized = llama_token_to_piece(ctx, token); \
|
||||
\
|
||||
detokenized.erase( \
|
||||
std::remove_if( \
|
||||
detokenized.begin(), \
|
||||
detokenized.end(), \
|
||||
[](const unsigned char c) { return !std::isprint(c); }), \
|
||||
detokenized.end()); \
|
||||
\
|
||||
buf \
|
||||
<< "'" << detokenized << "'" \
|
||||
<< ":" << std::to_string(token); \
|
||||
} \
|
||||
buf << " ]"; \
|
||||
\
|
||||
return buf.str(); \
|
||||
}() \
|
||||
.c_str()
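Since the macro form above already appends .c_str(), a call site can pass its result straight to a printf-style log statement. A hedged usage sketch (not part of this diff; `ctx` and `tokens` are assumed to already exist in the caller):

    // hedged usage sketch for the macro defined above; `ctx` is a llama_context*, `tokens` a std::vector<llama_token>
    LOG("prompt tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, tokens));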
|
||||
|
||||
#ifdef LOG_DISABLE_LOGS
|
||||
|
||||
@@ -657,10 +620,10 @@ inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
|
||||
#define LOGLN(...) // dummy stub
|
||||
|
||||
#undef LOG_TEE
|
||||
#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
|
||||
#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
|
||||
|
||||
#undef LOG_TEELN
|
||||
#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
|
||||
#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
|
||||
|
||||
#undef LOG_DISABLE
|
||||
#define LOG_DISABLE() // dummy stub
|
||||
|
||||
@@ -1,193 +0,0 @@
|
||||
#include "sampling.h"
|
||||
|
||||
struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params) {
|
||||
struct llama_sampling_context * result = new llama_sampling_context();
|
||||
|
||||
result->params = params.sampling_params;
|
||||
result->grammar = nullptr;
|
||||
|
||||
// if there is a grammar, parse it
|
||||
if (!params.grammar.empty()) {
|
||||
result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
||||
|
||||
// will be empty (default) if there are parse errors
|
||||
if (result->parsed_grammar.rules.empty()) {
|
||||
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
|
||||
|
||||
result->grammar = llama_grammar_init(
|
||||
grammar_rules.data(),
|
||||
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
|
||||
result->prev.resize(params.n_ctx);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void llama_sampling_free(struct llama_sampling_context * ctx) {
|
||||
if (ctx->grammar != NULL) {
|
||||
llama_grammar_free(ctx->grammar);
|
||||
}
|
||||
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
void llama_sampling_reset(llama_sampling_context * ctx) {
|
||||
if (ctx->grammar != NULL) {
|
||||
llama_grammar_free(ctx->grammar);
|
||||
}
|
||||
|
||||
if (!ctx->parsed_grammar.rules.empty()) {
|
||||
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
|
||||
|
||||
ctx->grammar = llama_grammar_init(
|
||||
grammar_rules.data(),
|
||||
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
|
||||
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
|
||||
ctx->cur.clear();
|
||||
}
|
||||
|
||||
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
|
||||
if (dst->grammar) {
|
||||
llama_grammar_free(dst->grammar);
|
||||
dst->grammar = nullptr;
|
||||
}
|
||||
|
||||
if (src->grammar) {
|
||||
dst->grammar = llama_grammar_copy(src->grammar);
|
||||
}
|
||||
|
||||
dst->prev = src->prev;
|
||||
}
|
||||
|
||||
llama_token llama_sampling_sample(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
struct llama_context * ctx_cfg,
|
||||
const int idx) {
|
||||
const int n_ctx = llama_n_ctx(ctx_main);
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||
|
||||
const llama_sampling_params & params = ctx_sampling->params;
|
||||
|
||||
const float temp = params.temp;
|
||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||
const float repeat_penalty = params.repeat_penalty;
|
||||
const float alpha_presence = params.presence_penalty;
|
||||
const float alpha_frequency = params.frequency_penalty;
|
||||
const int mirostat = params.mirostat;
|
||||
const float mirostat_tau = params.mirostat_tau;
|
||||
const float mirostat_eta = params.mirostat_eta;
|
||||
const bool penalize_nl = params.penalize_nl;
|
||||
|
||||
auto & prev = ctx_sampling->prev;
|
||||
auto & cur = ctx_sampling->cur;
|
||||
|
||||
llama_token id = 0;
|
||||
|
||||
float * logits = llama_get_logits_ith(ctx_main, idx);
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
cur.clear();
|
||||
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { cur.data(), cur.size(), false };
|
||||
|
||||
if (ctx_cfg) {
|
||||
llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale);
|
||||
}
|
||||
|
||||
// apply penalties
|
||||
if (!prev.empty()) {
|
||||
const float nl_logit = logits[llama_token_nl(ctx_main)];
|
||||
const int last_n_repeat = std::min(std::min((int)prev.size(), repeat_last_n), n_ctx);
|
||||
|
||||
llama_sample_repetition_penalty(ctx_main, &cur_p,
|
||||
prev.data() + prev.size() - last_n_repeat,
|
||||
last_n_repeat, repeat_penalty);
|
||||
llama_sample_frequency_and_presence_penalties(ctx_main, &cur_p,
|
||||
prev.data() + prev.size() - last_n_repeat,
|
||||
last_n_repeat, alpha_frequency, alpha_presence);
|
||||
|
||||
if (!penalize_nl) {
|
||||
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
||||
if (cur_p.data[idx].id == llama_token_nl(ctx_main)) {
|
||||
cur_p.data[idx].logit = nl_logit;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (ctx_sampling->grammar != NULL) {
|
||||
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
|
||||
}
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
id = llama_sample_token_greedy(ctx_main, &cur_p);
|
||||
} else {
|
||||
if (mirostat == 1) {
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temp(ctx_main, &cur_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
|
||||
} else if (mirostat == 2) {
|
||||
llama_sample_temp(ctx_main, &cur_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
size_t min_keep = std::max(1, params.n_probs);
|
||||
llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep);
|
||||
llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
|
||||
llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
|
||||
llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep);
|
||||
llama_sample_temp (ctx_main, &cur_p, temp);
|
||||
|
||||
id = llama_sample_token(ctx_main, &cur_p);
|
||||
|
||||
//{
|
||||
// const int n_top = 10;
|
||||
// LOG("top %d candidates:\n", n_top);
|
||||
|
||||
// for (int i = 0; i < n_top; i++) {
|
||||
// const llama_token id = cur_p.data[i].id;
|
||||
// (void)id; // To avoid a warning that id is unused when logging is disabled.
|
||||
// LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
|
||||
// }
|
||||
//}
|
||||
|
||||
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
void llama_sampling_accept(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
llama_token id) {
|
||||
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
|
||||
ctx_sampling->prev.push_back(id);
|
||||
|
||||
if (ctx_sampling->grammar != NULL) {
|
||||
llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
|
||||
}
|
||||
}
|
||||
@@ -1,99 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include "grammar-parser.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
// sampling parameters
|
||||
typedef struct llama_sampling_params {
|
||||
int32_t top_k = 40; // <= 0 to use vocab size
|
||||
float top_p = 0.95f; // 1.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typical_p = 1.00f; // 1.0 = disabled
|
||||
float temp = 0.80f; // 1.0 = disabled
|
||||
float repeat_penalty = 1.10f; // 1.0 = disabled
|
||||
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float frequency_penalty = 0.00f; // 0.0 = disabled
|
||||
float presence_penalty = 0.00f; // 0.0 = disabled
|
||||
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
|
||||
float mirostat_tau = 5.00f; // target entropy
|
||||
float mirostat_eta = 0.10f; // learning rate
|
||||
|
||||
bool penalize_nl = true; // consider newlines as a repeatable token
|
||||
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
|
||||
// Classifier-Free Guidance
|
||||
// https://arxiv.org/abs/2306.17806
|
||||
std::string cfg_negative_prompt; // string to help guidance
|
||||
float cfg_scale = 1.f; // How strong is guidance
|
||||
|
||||
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||
|
||||
} llama_sampling_params;
|
||||
|
||||
// general sampler context
|
||||
// TODO: move to llama.h
|
||||
struct llama_sampling_context {
|
||||
// parameters that will be used for sampling
|
||||
llama_sampling_params params;
|
||||
|
||||
// mirostat sampler state
|
||||
float mirostat_mu;
|
||||
|
||||
llama_grammar * grammar;
|
||||
|
||||
// internal
|
||||
grammar_parser::parse_state parsed_grammar;
|
||||
|
||||
// TODO: replace with ring-buffer
|
||||
std::vector<llama_token> prev;
|
||||
std::vector<llama_token_data> cur;
|
||||
};
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// Create a new sampling context instance.
|
||||
struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params);
|
||||
|
||||
void llama_sampling_free(struct llama_sampling_context * ctx);
|
||||
|
||||
// Reset the sampler context
|
||||
// - clear prev tokens
|
||||
// - reset grammar
|
||||
void llama_sampling_reset(llama_sampling_context * ctx);
|
||||
|
||||
// Copy the sampler context
|
||||
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
|
||||
|
||||
// this is a common sampling function used across the examples for convenience
|
||||
// it can serve as a starting point for implementing your own sampling function
|
||||
// Note: When using multiple sequences, it is the caller's responsibility to call
|
||||
// llama_sampling_reset when a sequence ends
|
||||
//
|
||||
// required:
|
||||
// - ctx_main: context to use for sampling
|
||||
// - ctx_sampling: sampling-specific context
|
||||
//
|
||||
// optional:
|
||||
// - ctx_cfg: context to use for classifier-free guidance
|
||||
// - idx: sample from llama_get_logits_ith(ctx, idx)
|
||||
//
|
||||
// returns:
|
||||
// - token: sampled token
|
||||
// - candidates: vector of candidate tokens
|
||||
//
|
||||
llama_token llama_sampling_sample(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
struct llama_context * ctx_cfg,
|
||||
int idx = 0);
|
||||
|
||||
void llama_sampling_accept(
|
||||
struct llama_sampling_context * ctx_sampling,
|
||||
struct llama_context * ctx_main,
|
||||
llama_token id);
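Taken together, the declarations above suggest the following call pattern. This is a hedged sketch, not code from the diff: `params`, `ctx_main`, the decode step, and the stopping condition are assumed to be supplied by the caller; only the llama_sampling_* calls come from this header.

    // hypothetical generation loop built on the llama_sampling_* API declared above
    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params); // params: gpt_params
    while (!done) {                                                             // caller-defined stop condition
        const llama_token id = llama_sampling_sample(ctx_sampling, ctx_main, nullptr); // no CFG context
        llama_sampling_accept(ctx_sampling, ctx_main, id);
        // ... feed `id` back through the decode step and break on end-of-stream ...
    }
    llama_sampling_free(ctx_sampling);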
|
||||
8396
common/stb_image.h
File diff suppressed because it is too large
1496
common/train.cpp
File diff suppressed because it is too large
230
common/train.h
@@ -1,230 +0,0 @@
|
||||
// Various helper functions and utilities for training
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <random>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
typedef std::string mt19937_state;
|
||||
|
||||
struct train_state {
|
||||
struct ggml_opt_context * opt;
|
||||
|
||||
uint64_t train_its;
|
||||
uint64_t train_samples;
|
||||
uint64_t train_tokens;
|
||||
uint64_t train_epochs;
|
||||
|
||||
size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
|
||||
mt19937_state shuffle_rng_state_current;
|
||||
mt19937_state shuffle_rng_state_next;
|
||||
size_t shuffle_sample_count;
|
||||
size_t shuffle_next_sample;
|
||||
};
|
||||
|
||||
struct train_params_common {
|
||||
const char * fn_train_data;
|
||||
const char * fn_checkpoint_in;
|
||||
const char * fn_checkpoint_out;
|
||||
const char * pattern_fn_it;
|
||||
const char * fn_latest;
|
||||
|
||||
bool print_usage;
|
||||
|
||||
int save_every;
|
||||
|
||||
uint32_t seed;
|
||||
|
||||
int n_ctx;
|
||||
int n_threads;
|
||||
int n_batch;
|
||||
int n_gradient_accumulation;
|
||||
int n_epochs;
|
||||
|
||||
bool custom_n_ctx;
|
||||
|
||||
bool use_flash;
|
||||
bool use_checkpointing;
|
||||
|
||||
std::string sample_start;
|
||||
bool include_sample_start;
|
||||
bool escape;
|
||||
bool overlapping_samples;
|
||||
bool fill_with_next_samples;
|
||||
bool separate_with_eos;
|
||||
bool separate_with_bos;
|
||||
bool sample_random_offsets;
|
||||
|
||||
bool force_reshuffle;
|
||||
|
||||
int warmup;
|
||||
int cos_decay_steps;
|
||||
float cos_decay_restart;
|
||||
float cos_decay_min;
|
||||
bool enable_restart;
|
||||
|
||||
int opt_past;
|
||||
float opt_delta;
|
||||
int opt_max_no_improvement;
|
||||
|
||||
int adam_n_iter;
|
||||
float adam_alpha;
|
||||
float adam_min_alpha;
|
||||
float adam_decay;
|
||||
int adam_decay_min_ndim;
|
||||
float adam_beta1;
|
||||
float adam_beta2;
|
||||
float adam_gclip;
|
||||
float adam_eps_f;
|
||||
};
|
||||
|
||||
typedef void (*save_train_files_callback)(void * data, struct train_state * train);
|
||||
|
||||
struct train_opt_callback_data {
|
||||
struct train_params_common * params;
|
||||
struct train_state * train;
|
||||
save_train_files_callback save_cb;
|
||||
void * save_data;
|
||||
struct llama_context * lctx;
|
||||
int last_save_iter;
|
||||
llama_token * tokens_data;
|
||||
size_t tokens_size;
|
||||
size_t * samples_begin;
|
||||
size_t * samples_size;
|
||||
size_t * shuffled_samples_offs;
|
||||
size_t * shuffled_samples_begin;
|
||||
size_t * shuffled_samples_size;
|
||||
size_t samples_count;
|
||||
struct ggml_tensor * tokens_input;
|
||||
struct ggml_tensor * target_probs;
|
||||
int first_iter;
|
||||
int first_epoch;
|
||||
int iter_at_last_epoch;
|
||||
int64_t last_time;
|
||||
double millis_per_iter;
|
||||
};
|
||||
|
||||
struct train_state * init_train_state();
|
||||
void free_train_state(struct train_state * state);
|
||||
|
||||
struct train_params_common get_default_train_params_common();
|
||||
void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);
|
||||
|
||||
bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
|
||||
void finish_processing_train_args(struct train_params_common * params);
|
||||
|
||||
struct random_normal_distribution;
|
||||
struct random_uniform_distribution;
|
||||
|
||||
struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
|
||||
struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
|
||||
|
||||
void free_random_normal_distribution (struct random_normal_distribution * rnd);
|
||||
void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
|
||||
|
||||
struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
|
||||
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
|
||||
|
||||
// generate random float in interval [0,1)
|
||||
float frand();
|
||||
float frand_normal (struct random_normal_distribution * rnd);
|
||||
float frand_uniform(struct random_uniform_distribution * rnd);
|
||||
|
||||
int clamp (const int v, const int min, const int max);
|
||||
float fclamp(const float v, const float min, const float max);
|
||||
|
||||
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
|
||||
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
|
||||
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
|
||||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
|
||||
|
||||
size_t tokenize_file(
|
||||
struct llama_context * lctx,
|
||||
const char * filename,
|
||||
const std::string & sample_start,
|
||||
bool include_sample_start,
|
||||
bool overlapping_samples,
|
||||
unsigned context_length,
|
||||
std::vector<llama_token> & out_tokens,
|
||||
std::vector<size_t> & out_samples_begin,
|
||||
std::vector<size_t> & out_samples_size);
|
||||
|
||||
int64_t get_example_targets_batch(
|
||||
struct llama_context * lctx,
|
||||
struct ggml_tensor * tokens_input,
|
||||
struct ggml_tensor * target_probs,
|
||||
int64_t example_id,
|
||||
const size_t * samples_offs,
|
||||
const size_t * samples_begin,
|
||||
const size_t * samples_size,
|
||||
size_t samples_count,
|
||||
const llama_token * train_data,
|
||||
size_t n_train_data,
|
||||
bool separate_with_eos,
|
||||
bool separate_with_bos,
|
||||
bool fill_with_next_samples,
|
||||
bool sample_random_offsets);
|
||||
|
||||
|
||||
void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
|
||||
mt19937_state mt19937_get_state(const std::mt19937& rng);
|
||||
mt19937_state mt19937_seed_to_state(unsigned seed);
|
||||
|
||||
mt19937_state shuffle_samples(
|
||||
const mt19937_state & rng_state,
|
||||
size_t * shuffled_offs,
|
||||
size_t * shuffled_begins,
|
||||
size_t * shuffled_sizes,
|
||||
const size_t * begins,
|
||||
const size_t * sizes,
|
||||
size_t count);
|
||||
|
||||
size_t hash_combine(size_t h1, size_t h2);
|
||||
|
||||
size_t compute_samples_hash(
|
||||
const char* fn,
|
||||
const size_t* samples_begin,
|
||||
const size_t* samples_size,
|
||||
size_t sample_count);
|
||||
|
||||
|
||||
std::string replace_str(const char * s, const char * needle, const char * replacement);
|
||||
|
||||
void print_duration(double milliseconds);
|
||||
|
||||
float cosine_decay(
|
||||
int64_t step,
|
||||
int64_t decay_steps,
|
||||
float minimum);
|
||||
|
||||
float cosine_decay_restart(
|
||||
int64_t step,
|
||||
int64_t decay_steps,
|
||||
float minimum,
|
||||
float restart_step_mult);
|
||||
|
||||
float learning_schedule(
|
||||
int64_t step,
|
||||
int64_t warmup_steps,
|
||||
int64_t decay_steps,
|
||||
float learning_rate,
|
||||
float overall_minimum,
|
||||
float cos_decay_minimum,
|
||||
float cos_decay_restart_step_mult,
|
||||
bool enable_restart);
|
||||
|
||||
void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
|
||||
|
||||
void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
|
||||
void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
|
||||
|
||||
bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
|
||||
void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);
|
||||
|
||||
std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
|
||||
|
||||
void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);
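As a rough orientation only, the helpers above are meant to be used along these lines. This is a sketch under the assumption that the caller supplies the data loading and the optimizer loop itself; nothing here beyond the declared function names comes from the diff.

    // hypothetical skeleton using only functions declared in this header
    struct train_params_common params = get_default_train_params_common();
    struct train_state * train = init_train_state();
    // ... load data with tokenize_file(), reorder it with shuffle_samples(),
    //     then drive ggml's optimizer, passing train_opt_callback as the per-step hook ...
    free_train_state(train);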
|
||||
@@ -11,14 +11,11 @@ import sys
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
import itertools
|
||||
import gguf
|
||||
import numpy as np
|
||||
import torch
|
||||
from sentencepiece import SentencePieceProcessor # type: ignore[import]
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import TypeAlias
|
||||
@@ -177,11 +174,8 @@ if not tokenizer_model_file.is_file():
|
||||
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
|
||||
|
||||
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
|
||||
vocab_size = hparams.get('vocab_size')
|
||||
if vocab_size is None:
|
||||
vocab_size = tokenizer.vocab_size()
|
||||
|
||||
for i in range(vocab_size):
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
text: bytes
|
||||
score: float
|
||||
|
||||
|
||||
@@ -1,238 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF bloom --> gguf conversion
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer # type: ignore[import]
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
return num_parts
|
||||
|
||||
|
||||
# Supported Models:
|
||||
# https://huggingface.co/bigscience/bloom-1b7
|
||||
# https://huggingface.co/bigscience/bloom-3b
|
||||
# https://huggingface.co/bigscience/bloom-7b1
|
||||
# https://huggingface.co/Langboat/bloom-1b4-zh
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||
|
||||
print("gguf: loading model "+dir_model.name)
|
||||
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "BloomForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
sys.exit(1)
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.BLOOM
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["n_layer"]
|
||||
|
||||
gguf_writer.add_name("Bloom")
|
||||
n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
|
||||
n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
|
||||
gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
|
||||
gguf_writer.add_embedding_length(n_embed)
|
||||
gguf_writer.add_feed_forward_length(4 * n_embed)
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_head_count(n_head)
|
||||
gguf_writer.add_head_count_kv(n_head)
|
||||
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
||||
gguf_writer.add_file_type(ftype)
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
|
||||
|
||||
# params for qkv transform
|
||||
n_head_kv = hparams.get("n_head_kv", n_head)
|
||||
head_dim = n_embed // n_head
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
||||
|
||||
has_lm_head = True
|
||||
if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
|
||||
has_lm_head = False
|
||||
|
||||
for original_name in model_part.keys():
|
||||
data = model_part[original_name]
|
||||
name = re.sub(r'transformer\.', '', original_name)
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
|
||||
# Map bloom-style qkv_linear to gpt-style qkv_linear
|
||||
# bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
|
||||
# gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
|
||||
qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
|
||||
data = np.concatenate(
|
||||
(qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
|
||||
qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
|
||||
qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
|
||||
axis=0
|
||||
)
|
||||
print("re-format attention.linear_qkv.weight")
|
||||
elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
|
||||
qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
|
||||
data = np.concatenate(
|
||||
(qkv_bias[:, 0, :].reshape((n_embed,)),
|
||||
qkv_bias[:, 1, :].reshape((n_embed,)),
|
||||
qkv_bias[:, 2, :].reshape((n_embed,))),
|
||||
axis=0
|
||||
)
|
||||
print("re-format attention.linear_qkv.bias")
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
if not has_lm_head and name == "word_embeddings.weight":
|
||||
gguf_writer.add_tensor("output.weight", data)
|
||||
print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("")
|
||||
@@ -4,7 +4,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import contextlib
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
@@ -21,10 +20,32 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
import gguf
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path, prefix: str) -> int:
|
||||
def bytes_to_unicode():
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
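As a quick illustration of why the converter builds this table (a sketch, not part of the diff): the mapping is inverted into a byte_decoder so each GPT-2 vocab string can be turned back into the raw bytes that get stored in the GGUF token list, mirroring the loop further down.

    # hedged round-trip example for the helper above
    byte_encoder = bytes_to_unicode()
    byte_decoder = {v: k for k, v in byte_encoder.items()}
    token_str = "".join(byte_encoder[b] for b in "héllo".encode("utf-8"))
    raw_bytes = bytearray([byte_decoder[c] for c in token_str])
    assert raw_bytes.decode("utf-8") == "héllo"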
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith(prefix):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
@@ -78,26 +99,20 @@ print("gguf: loading model "+dir_model.name)
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "FalconForCausalLM":
|
||||
if hparams["architectures"][0] != "RWForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model, "model-00")
|
||||
if num_parts:
|
||||
is_safetensors = True
|
||||
from safetensors import safe_open
|
||||
else:
|
||||
is_safetensors = False
|
||||
num_parts = count_model_parts(dir_model, "pytorch_model-")
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.FALCON
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
block_count = hparams["n_layer"]
|
||||
|
||||
gguf_writer.add_name("Falcon")
|
||||
gguf_writer.add_context_length(2048) # not in config.json
|
||||
@@ -105,9 +120,9 @@ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_head_count(hparams["num_attention_heads"])
|
||||
if "num_kv_heads" in hparams:
|
||||
gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
|
||||
gguf_writer.add_head_count(hparams["n_head"])
|
||||
if "n_head_kv" in hparams:
|
||||
gguf_writer.add_head_count_kv(hparams["n_head_kv"])
|
||||
else:
|
||||
gguf_writer.add_head_count_kv(1)
|
||||
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
||||
@@ -121,25 +136,49 @@ tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i])
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
@@ -153,8 +192,8 @@ special_vocab.add_to_gguf(gguf_writer)
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# params for qkv transform
|
||||
n_head = hparams["num_attention_heads"]
|
||||
n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1
|
||||
n_head = hparams["n_head"]
|
||||
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
|
||||
|
||||
head_dim = hparams["hidden_size"] // n_head
|
||||
|
||||
@@ -163,10 +202,6 @@ print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
elif is_safetensors:
|
||||
part_names = (
|
||||
f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
|
||||
)
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
@@ -176,64 +211,60 @@ for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
if is_safetensors:
|
||||
ctx = safe_open(dir_model / part_name, framework="pt", device="cpu")
|
||||
else:
|
||||
ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu"))
|
||||
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
||||
|
||||
with ctx as model_part:
|
||||
for name in model_part.keys():
|
||||
data = model_part.get_tensor(name) if is_safetensors else model_part[name]
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
old_dtype = data.dtype
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
# QKV tensor transform
|
||||
# The original query_key_value tensor contains n_head_kv "kv groups",
|
||||
# each consisting of n_head/n_head_kv query weights followed by one key
|
||||
# and one value weight (shared by all query heads in the kv group).
|
||||
# This layout makes it a big pain to work with in GGML.
|
||||
# So we rearrange them here, so that we have n_head query weights
|
||||
# followed by n_head_kv key weights followed by n_head_kv value weights,
|
||||
# in contiguous fashion.
|
||||
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
|
||||
# QKV tensor transform
|
||||
# The original query_key_value tensor contains n_head_kv "kv groups",
|
||||
# each consisting of n_head/n_head_kv query weights followed by one key
|
||||
# and one value weight (shared by all query heads in the kv group).
|
||||
# This layout makes it a big pain to work with in GGML.
|
||||
# So we rearrange them here, so that we have n_head query weights
|
||||
# followed by n_head_kv key weights followed by n_head_kv value weights,
|
||||
# in contiguous fashion.
|
||||
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
|
||||
|
||||
if "query_key_value" in name:
|
||||
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
|
||||
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
|
||||
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
|
||||
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
|
||||
data = torch.cat((q,k,v)).reshape_as(data)
|
||||
if "query_key_value" in name:
|
||||
qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
|
||||
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
|
||||
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
|
||||
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
|
||||
data = torch.cat((q,k,v)).reshape_as(data)
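To make the rearrangement concrete, here is a tiny self-contained check with toy dimensions (hypothetical values chosen purely for illustration, not taken from any Falcon checkpoint):

    # toy example: n_head = 4, n_head_kv = 2, head_dim = 2, hidden size = head_dim * n_head = 8
    import torch
    n_head, n_head_kv, head_dim = 4, 2, 2
    data = torch.arange((n_head + 2 * n_head_kv) * head_dim * head_dim * n_head, dtype=torch.float32)
    data = data.reshape((n_head + 2 * n_head_kv) * head_dim, head_dim * n_head)
    qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
    q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)      # all query heads first
    k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)   # then the shared keys
    v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)   # then the shared values
    out = torch.cat((q, k, v)).reshape_as(data)
    assert out.shape == data.shape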
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
|
||||
@@ -19,6 +19,29 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
@@ -107,32 +130,48 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
vocab_size = len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
@@ -1,218 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF mpt --> gguf conversion
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer # type: ignore[import]
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
return num_parts
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file")
|
||||
parser.add_argument(
|
||||
"--vocab-only", action="store_true",
|
||||
help="extract only the vocab",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path,
|
||||
help="path to write to; default: based on input",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model", type=Path,
|
||||
help="directory containing model file, or model file itself (*.bin)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
||||
help="output format - use 0 for float32, 1 for float16",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||
|
||||
print("gguf: loading model "+dir_model.name)
|
||||
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "MPTForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit()
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.MPT
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["n_layers"]
|
||||
|
||||
gguf_writer.add_name(dir_model.name)
|
||||
gguf_writer.add_context_length(hparams["max_seq_len"])
|
||||
gguf_writer.add_embedding_length(hparams["d_model"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
|
||||
gguf_writer.add_head_count(hparams["n_heads"])
|
||||
if kv_n_heads := hparams["attn_config"].get("kv_n_heads"):
|
||||
gguf_writer.add_head_count_kv(kv_n_heads)
|
||||
gguf_writer.add_layer_norm_eps(1e-05)
|
||||
if hparams["attn_config"]["clip_qkv"] is not None:
|
||||
gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
|
||||
gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but
|
||||
# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to
|
||||
# accommodate some "reserved" tokens; this is causing problems down the line in
|
||||
# llama.cpp, so we pad the vocab with dummy tokens:
|
||||
|
||||
vocab_size = hparams["vocab_size"]
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Cannot map tensor '" + name + "'")
|
||||
continue # for the sake of compatibility with some old published models, don't quit
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
# note: MPT output is tied to (same as) wte in original model;
|
||||
# for easier implementation in llama.cpp it's duplicated in GGUF, though :/
|
||||
if new_name == "token_embd.weight":
|
||||
gguf_writer.add_tensor("output.weight", data)
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("")
|
||||
@@ -1,130 +0,0 @@
|
||||
import torch
|
||||
import os
|
||||
from pprint import pprint
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
def _flatten_dict(dct, tensors, prefix=None):
|
||||
assert isinstance(dct, dict)
|
||||
for key in dct.keys():
|
||||
new_prefix = prefix + '.' + key if prefix is not None else key
|
||||
if isinstance(dct[key], torch.Tensor):
|
||||
tensors[new_prefix] = dct[key]
|
||||
elif isinstance(dct[key], dict):
|
||||
_flatten_dict(dct[key], tensors, new_prefix)
|
||||
else:
|
||||
raise ValueError(type(dct[key]))
|
||||
return None
|
||||
|
||||
def _get_sentencepiece_tokenizer_info(dir_model: Path):
|
||||
tokenizer_path = dir_model / 'adept_vocab.model'
|
||||
print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
|
||||
tokenizer = SentencePieceProcessor(str(tokenizer_path))
|
||||
print('gguf: adding tokens')
|
||||
tokens: list[bytes] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
text: bytes
|
||||
score: float
|
||||
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(i)
|
||||
|
||||
toktype = 1
|
||||
if tokenizer.is_unknown(i):
|
||||
toktype = 2
|
||||
if tokenizer.is_control(i):
|
||||
toktype = 3
|
||||
if tokenizer.is_unused(i):
|
||||
toktype = 5
|
||||
if tokenizer.is_byte(i):
|
||||
toktype = 6
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
pass
|
||||
return tokens, scores, toktypes
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
|
||||
parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
|
||||
parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
|
||||
args = parser.parse_args()
|
||||
sys.path.append(str(args.adept_inference_dir))
|
||||
persimmon_model = torch.load(args.ckpt_path)
|
||||
hparams = persimmon_model['args']
|
||||
pprint(hparams)
|
||||
tensors = {}
|
||||
_flatten_dict(persimmon_model['model'], tensors, None)
|
||||
|
||||
arch = gguf.MODEL_ARCH.PERSIMMON
|
||||
gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
|
||||
|
||||
block_count = hparams.num_layers
|
||||
head_count = hparams.num_attention_heads
|
||||
head_count_kv = head_count
|
||||
ctx_length = hparams.seq_length
|
||||
hidden_size = hparams.hidden_size
|
||||
|
||||
gguf_writer.add_name('persimmon-8b-chat')
|
||||
gguf_writer.add_context_length(ctx_length)
|
||||
gguf_writer.add_embedding_length(hidden_size)
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
|
||||
gguf_writer.add_rope_dimension_count(hidden_size // head_count)
|
||||
gguf_writer.add_head_count(head_count)
|
||||
gguf_writer.add_head_count_kv(head_count_kv)
|
||||
gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
|
||||
gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
|
||||
|
||||
tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
|
||||
gguf_writer.add_tokenizer_model('llama')
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
gguf_writer.add_bos_token_id(71013)
|
||||
gguf_writer.add_eos_token_id(71013)
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(arch, block_count)
|
||||
print(tensor_map)
|
||||
for name in tensors.keys():
|
||||
data = tensors[name]
|
||||
if name.endswith(".self_attention.rotary_emb.inv_freq"):
|
||||
continue
|
||||
old_dtype = data.dtype
|
||||
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
|
||||
data = data.to(torch.float32).squeeze().numpy()
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
n_dims = len(data.shape)
|
||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{args.outfile}'")
|
||||
print("")
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,263 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF Refact --> GGUF conversion
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer # type: ignore[import]
|
||||
|
||||
if "NO_LOCAL_GGUF" not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
|
||||
import gguf
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
return num_parts
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Convert a Refact model to a GGML compatible file"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--vocab-only",
|
||||
action="store_true",
|
||||
help="extract only the vocab",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outfile",
|
||||
type=Path,
|
||||
help="path to write to; default: based on input",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model",
|
||||
type=Path,
|
||||
help="directory containing model file, or model file itself (*.bin)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"ftype",
|
||||
type=int,
|
||||
choices=[0, 1],
|
||||
default=1,
|
||||
nargs="?",
|
||||
help="output format - use 0 for float32, 1 for float16",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f"Error: {args.model} is not a directory", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"
|
||||
|
||||
print("gguf: loading model " + dir_model.name)
|
||||
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "GPTRefactForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
ARCH = gguf.MODEL_ARCH.REFACT
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
# Get refact feed forward dimension
|
||||
hidden_dim = hparams["n_embd"]
|
||||
inner_dim = 4 * hidden_dim
|
||||
hidden_dim = int(2 * inner_dim / 3)
|
||||
multiple_of = 256
|
||||
ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
|
||||
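# (illustrative note, not part of the original script: this is the SwiGLU-style
#  "2/3 of 4*n_embd, rounded up to a multiple of 256" sizing rule. For a
#  hypothetical n_embd = 4096: inner_dim = 16384, hidden_dim = int(2*16384/3) = 10922,
#  and ff_dim = 256 * ((10922 + 255) // 256) = 11008.)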
|
||||
block_count = hparams["n_layer"]
|
||||
|
||||
gguf_writer.add_name("Refact")
|
||||
# refact uses Alibi. So this is from config.json which might be used by training.
|
||||
gguf_writer.add_context_length(hparams["n_positions"])
|
||||
gguf_writer.add_embedding_length(hparams["n_embd"])
|
||||
|
||||
gguf_writer.add_feed_forward_length(ff_dim)
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_head_count(hparams["n_head"])
|
||||
gguf_writer.add_head_count_kv(1)
|
||||
gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"])
|
||||
gguf_writer.add_file_type(ftype)
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
|
||||
|
||||
# params for qkv transform
|
||||
n_head = hparams["n_head"]
|
||||
n_head_kv = 1
|
||||
|
||||
head_dim = hparams["n_embd"] // n_head
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
||||
|
||||
for i in range(block_count):
|
||||
if f"transformer.h.{i}.attn.kv.weight" in model_part:
|
||||
data = model_part[f"transformer.h.{i}.attn.kv.weight"]
|
||||
model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[
|
||||
: n_head_kv * head_dim
|
||||
]
|
||||
model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[
|
||||
n_head_kv * head_dim :
|
||||
]
|
||||
del model_part[f"transformer.h.{i}.attn.kv.weight"]
|
||||
if f"transformer.h.{i}.attn.q.weight" in model_part:
|
||||
model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[
|
||||
f"transformer.h.{i}.attn.q.weight"
|
||||
]
|
||||
del model_part[f"transformer.h.{i}.attn.q.weight"]
|
||||
if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part:
|
||||
data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
|
||||
model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim]
|
||||
model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:]
|
||||
del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
|
||||
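# (illustrative note, not part of the original script: the loop above undoes the
#  fused projections stored in the HF checkpoint. attn.kv.weight stacks K on top
#  of V, so the first n_head_kv*head_dim rows become k_proj and the remaining rows
#  v_proj; n_head_kv is 1 here, i.e. multi-query attention. Likewise
#  mlp.gate_up_proj.weight stacks gate on top of up and is split at row ff_dim.)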
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if (
|
||||
ftype == 1
|
||||
and data_dtype == np.float32
|
||||
and name.endswith(".weight")
|
||||
and n_dims == 2
|
||||
):
|
||||
data = data.astype(np.float16)
|
||||
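# (illustrative note, not part of the original script: the net effect of the three
#  checks above is that with ftype == 0 everything is written as float32, while
#  with ftype == 1 2-dimensional ".weight" tensors end up as float16 and
#  1-dimensional tensors such as norms and biases are promoted back to float32.)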
|
||||
print(
|
||||
new_name
|
||||
+ ", n_dims = "
|
||||
+ str(n_dims)
|
||||
+ ", "
|
||||
+ str(old_dtype)
|
||||
+ " --> "
|
||||
+ str(data.dtype)
|
||||
)
|
||||
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("")
|
||||
@@ -20,6 +20,28 @@ if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
import gguf
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
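# (illustrative note, not part of the original diff: bytes_to_unicode() gives every
#  byte 0..255 a printable stand-in so BPE merges can operate on visible characters,
#  e.g. bytes_to_unicode()[ord(' ')] == 'Ġ' (U+0120); the byte_decoder built from it
#  later in this file inverts the mapping when reconstructing raw token bytes.)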
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
@@ -98,25 +120,49 @@ tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(0.0)  # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
|
||||
29
convert.py
@@ -41,7 +41,8 @@ if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
|
||||
|
||||
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
||||
|
||||
ARCH = gguf.MODEL_ARCH.LLAMA
|
||||
ARCH=gguf.MODEL_ARCH.LLAMA
|
||||
NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
|
||||
|
||||
DEFAULT_CONCURRENCY = 8
|
||||
#
|
||||
@@ -338,15 +339,29 @@ class BpeVocab:
|
||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
tokenizer = self.bpe_tokenizer
|
||||
from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import]
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
|
||||
|
||||
for i, _ in enumerate(tokenizer):
|
||||
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
|
||||
byte_encoder = tokenization_gpt2.bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
score = 0.0
|
||||
for i, item in enumerate(tokenizer):
|
||||
text: bytes = item.encode("utf-8")
|
||||
# FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
|
||||
if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
|
||||
if i == 0 and text == b'<unk>':
|
||||
toktype = gguf.TokenType.UNKNOWN
|
||||
elif i == 1 or i == 2:
|
||||
toktype = gguf.TokenType.CONTROL
|
||||
elif i >= 3 and text.startswith(b'<0x'):
|
||||
toktype = gguf.TokenType.BYTE
|
||||
else:
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
else:
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
yield text, score, toktype
|
||||
|
||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
for text in self.added_tokens_list:
|
||||
score = -1000.0
|
||||
yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
|
||||
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
|
||||
|
||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
yield from self.bpe_tokens()
|
||||
@@ -938,7 +953,7 @@ class OutputFile:
|
||||
of.close()
|
||||
|
||||
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
|
||||
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
|
||||
wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
|
||||
|
||||
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
||||
return GGMLFileType.AllF32
|
||||
|
||||
@@ -48,8 +48,8 @@ make -j
According to the BLIS documentation, we could set the following
environment variables to modify the behavior of openmp:

```bash
export GOMP_CPU_AFFINITY="0-19"
```
export GOMP_GPU_AFFINITY="0-19"
export BLIS_NUM_THREADS=14
```

@@ -21,15 +21,12 @@ else()
|
||||
add_subdirectory(benchmark)
|
||||
add_subdirectory(baby-llama)
|
||||
add_subdirectory(train-text-from-scratch)
|
||||
add_subdirectory(finetune)
|
||||
add_subdirectory(convert-llama2c-to-ggml)
|
||||
add_subdirectory(simple)
|
||||
add_subdirectory(batched)
|
||||
add_subdirectory(batched-bench)
|
||||
add_subdirectory(speculative)
|
||||
add_subdirectory(parallel)
|
||||
add_subdirectory(embd-input)
|
||||
add_subdirectory(llava)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(beam-search)
|
||||
if (LLAMA_METAL)
|
||||
@@ -38,5 +35,4 @@ else()
|
||||
if (LLAMA_BUILD_SERVER)
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
add_subdirectory(export-lora)
|
||||
endif()
|
||||
|
||||
@@ -1,12 +1,8 @@
|
||||
#include "ggml.h"
|
||||
#include "train.h"
|
||||
|
||||
#include <vector>
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <random>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
@@ -18,6 +14,31 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
|
||||
constexpr float rms_norm_eps = 5e-6f;
|
||||
#endif
|
||||
|
||||
static float frand() {
|
||||
return (float)rand()/(float)RAND_MAX;
|
||||
}
|
||||
|
||||
struct random_normal_distribution {
|
||||
std::mt19937 gen;
|
||||
std::normal_distribution<float> nd;
|
||||
float min;
|
||||
float max;
|
||||
};
|
||||
|
||||
static void init_random_normal_distribution(
|
||||
struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
|
||||
) {
|
||||
rnd->gen = std::mt19937(seed);
|
||||
rnd->nd = std::normal_distribution<float>{mean, std};
|
||||
rnd->min = min;
|
||||
rnd->max = max;
|
||||
}
|
||||
|
||||
static float frand_normal(struct random_normal_distribution * rnd) {
|
||||
const float r = rnd->nd(rnd->gen);
|
||||
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
|
||||
}
|
||||
|
||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
|
||||
@@ -67,7 +88,55 @@ static struct ggml_tensor * randomize_tensor(
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
}
|
||||
};
|
||||
|
||||
return tensor;
|
||||
}
|
||||
|
||||
static struct ggml_tensor * randomize_tensor_normal(
|
||||
struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
|
||||
) {
|
||||
float scale = 1.0; // xavier
|
||||
switch (ndims) {
|
||||
case 1:
|
||||
scale /= sqrtf(ne[0]);
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
((float *)tensor->data)[i0] = scale * frand_normal(rnd);
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
scale /= sqrtf(ne[0]+ne[1]);
|
||||
for (int i1 = 0; i1 < ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
scale /= sqrtf(ne[0]+ne[1]);
|
||||
for (int i2 = 0; i2 < ne[2]; i2++) {
|
||||
for (int i1 = 0; i1 < ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
scale /= sqrtf(ne[0]+ne[1]);
|
||||
for (int i3 = 0; i3 < ne[3]; i3++) {
|
||||
for (int i2 = 0; i2 < ne[2]; i2++) {
|
||||
for (int i1 = 0; i1 < ne[1]; i1++) {
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
};
|
||||
|
||||
return tensor;
|
||||
}
|
||||
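// (illustrative note, not part of the original diff: `scale` is a Xavier-style
//  1/sqrt(fan) factor; note that the 3-d and 4-d cases above keep using
//  sqrtf(ne[0]+ne[1]) for the denominator, just like the 2-d case.)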
@@ -329,29 +398,27 @@ static void randomize_model(struct llama_model * model, int seed, float mean, fl
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
|
||||
struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
|
||||
|
||||
randomize_tensor_normal(model->tok_embeddings , rnd);
|
||||
randomize_tensor_normal(model->norm , rnd);
|
||||
randomize_tensor_normal(model->output , rnd);
|
||||
struct random_normal_distribution rnd;
|
||||
init_random_normal_distribution(&rnd, seed, mean, std, min, max);
|
||||
randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
|
||||
randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd);
|
||||
randomize_tensor_normal(model->output, model->output->n_dims, model->output->ne, &rnd);
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
auto & layer = model->layers[i];
|
||||
randomize_tensor_normal(layer.attention_norm, rnd);
|
||||
randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
|
||||
|
||||
randomize_tensor_normal(layer.wq, rnd);
|
||||
randomize_tensor_normal(layer.wk, rnd);
|
||||
randomize_tensor_normal(layer.wv, rnd);
|
||||
randomize_tensor_normal(layer.wo, rnd);
|
||||
randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd);
|
||||
randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd);
|
||||
randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd);
|
||||
randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd);
|
||||
|
||||
randomize_tensor_normal(layer.ffn_norm, rnd);
|
||||
randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
|
||||
|
||||
randomize_tensor_normal(layer.w1, rnd);
|
||||
randomize_tensor_normal(layer.w2, rnd);
|
||||
randomize_tensor_normal(layer.w3, rnd);
|
||||
randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
|
||||
randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
|
||||
randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
|
||||
}
|
||||
|
||||
free_random_normal_distribution(rnd);
|
||||
}
|
||||
|
||||
|
||||
@@ -362,37 +429,35 @@ static void randomize_model_lora(
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
|
||||
struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
|
||||
|
||||
randomize_tensor_normal(model->tok_embeddings, rnd);
|
||||
randomize_tensor_normal(model->norm , rnd);
|
||||
randomize_tensor_normal(model->outputa , rnd);
|
||||
randomize_tensor_normal(model->outputb , rnd);
|
||||
struct random_normal_distribution rnd;
|
||||
init_random_normal_distribution(&rnd, seed, mean, std, min, max);
|
||||
randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
|
||||
randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd);
|
||||
randomize_tensor_normal(model->outputa, model->outputa->n_dims, model->outputa->ne, &rnd);
|
||||
randomize_tensor_normal(model->outputb, model->outputb->n_dims, model->outputb->ne, &rnd);
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
auto & layer = model->layers[i];
|
||||
randomize_tensor_normal(layer.attention_norm, rnd);
|
||||
randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
|
||||
|
||||
randomize_tensor_normal(layer.wqa, rnd);
|
||||
randomize_tensor_normal(layer.wqb, rnd);
|
||||
randomize_tensor_normal(layer.wka, rnd);
|
||||
randomize_tensor_normal(layer.wkb, rnd);
|
||||
randomize_tensor_normal(layer.wva, rnd);
|
||||
randomize_tensor_normal(layer.wvb, rnd);
|
||||
randomize_tensor_normal(layer.woa, rnd);
|
||||
randomize_tensor_normal(layer.wob, rnd);
|
||||
randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd);
|
||||
randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd);
|
||||
randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd);
|
||||
randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd);
|
||||
randomize_tensor_normal(layer.wva, layer.wva->n_dims, layer.wva->ne, &rnd);
|
||||
randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd);
|
||||
randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd);
|
||||
randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd);
|
||||
|
||||
randomize_tensor_normal(layer.ffn_norm, rnd);
|
||||
randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
|
||||
|
||||
randomize_tensor_normal(layer.w1, rnd);
|
||||
randomize_tensor_normal(layer.w2, rnd);
|
||||
randomize_tensor_normal(layer.w3, rnd);
|
||||
randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
|
||||
randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
|
||||
randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
|
||||
}
|
||||
|
||||
free_random_normal_distribution(rnd);
|
||||
}
|
||||
|
||||
static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
|
||||
static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_ctx = hparams.n_ctx;
|
||||
@@ -418,12 +483,14 @@ static void init_kv_cache(struct llama_kv_cache* cache, struct llama_model * mod
|
||||
|
||||
if (!cache->ctx) {
|
||||
fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
|
||||
exit(1);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
|
||||
cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
|
||||
@@ -695,6 +762,32 @@ static struct ggml_tensor * forward(
|
||||
return inpL;
|
||||
}
|
||||
|
||||
static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
||||
GGML_ASSERT(tensor->n_dims == 1);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
}
|
||||
|
||||
static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
||||
GGML_ASSERT(tensor->n_dims == 2);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
}
|
||||
|
||||
static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
||||
GGML_ASSERT(tensor->n_dims == 3);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
GGML_ASSERT(tensor->ne[2] == ne2);
|
||||
}
|
||||
|
||||
static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||||
GGML_ASSERT(tensor->n_dims == 4);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
GGML_ASSERT(tensor->ne[2] == ne2);
|
||||
GGML_ASSERT(tensor->ne[3] == ne3);
|
||||
}
|
||||
|
||||
static struct ggml_tensor * forward_batch(
|
||||
struct llama_model * model,
|
||||
struct llama_kv_cache * cache,
|
||||
|
||||
@@ -1,5 +0,0 @@
set(TARGET batched-bench)
add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -1,51 +0,0 @@
# llama.cpp/example/batched-bench

Benchmark the batched decoding performance of `llama.cpp`

## Usage

There are 2 modes of operation:

- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

```bash
./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>

# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99

# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99

# custom set of batches
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
```

## Sample results

- `PP` - prompt tokens per batch
- `TG` - generated tokens per batch
- `B` - number of batches
- `N_KV` - required KV cache size
- `T_PP` - prompt processing time (i.e. time to first token)
- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
- `T_TG` - time to generate all batches
- `S_TG` - text generation speed (`(B*TG)/T_TG`)
- `T` - total time
- `S` - total speed (i.e. all tokens / total time)
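
As a quick sanity check of the formulas above, the snippet below (not part of the original README, a minimal sketch) recomputes the derived columns of the first result row from `PP`, `TG`, `B` and the measured times; the two timing values are simply the ones quoted in the table.

```python
# Recompute the derived columns of the first table row (PP=128, TG=128, B=1).
pp, tg, b = 128, 128, 1
t_pp, t_tg = 0.108, 3.079        # seconds, taken from the table below
is_pp_shared = False

n_kv = pp + b * tg if is_pp_shared else b * (pp + tg)  # 256
s_pp = (b * pp) / t_pp                                  # ~1185 t/s
s_tg = (b * tg) / t_tg                                  # ~41.6 t/s
t_total = t_pp + t_tg                                   # ~3.187 s
s_total = n_kv / t_total                                # ~80.3 t/s

print(n_kv, round(s_pp, 2), round(s_tg, 2), round(t_total, 3), round(s_total, 2))
```
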
| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
| 128 | 128 | 1 | 256 | 0.108 | 1186.64 | 3.079 | 41.57 | 3.187 | 80.32 |
| 128 | 128 | 2 | 512 | 0.198 | 1295.19 | 5.029 | 50.90 | 5.227 | 97.95 |
| 128 | 128 | 4 | 1024 | 0.373 | 1373.96 | 6.878 | 74.44 | 7.251 | 141.23 |
| 128 | 128 | 8 | 2048 | 0.751 | 1363.27 | 7.344 | 139.43 | 8.095 | 252.99 |
| 128 | 128 | 16 | 4096 | 1.570 | 1304.68 | 8.455 | 242.23 | 10.024 | 408.60 |
| 128 | 128 | 32 | 8192 | 3.408 | 1201.73 | 8.801 | 465.40 | 12.209 | 670.96 |
| 128 | 256 | 1 | 384 | 0.107 | 1196.70 | 6.329 | 40.45 | 6.436 | 59.67 |
| 128 | 256 | 2 | 768 | 0.194 | 1317.45 | 10.239 | 50.00 | 10.433 | 73.61 |
| 128 | 256 | 4 | 1536 | 0.366 | 1399.03 | 13.960 | 73.35 | 14.326 | 107.22 |
| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 |
| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 |
| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 |
@@ -1,243 +0,0 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// mutates the input string
|
||||
static std::vector<int> parse_list(char * p) {
|
||||
std::vector<int> ret;
|
||||
|
||||
char * q = p;
|
||||
|
||||
while (*p) {
|
||||
if (*p == ',') {
|
||||
*p = '\0';
|
||||
ret.push_back(std::atoi(q));
|
||||
q = p + 1;
|
||||
}
|
||||
|
||||
++p;
|
||||
}
|
||||
|
||||
ret.push_back(std::atoi(q));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (argc == 1 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
|
||||
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
|
||||
printf(" example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
|
||||
return 1 ;
|
||||
}
|
||||
|
||||
int n_kv_max = 2048;
|
||||
int is_pp_shared = 0;
|
||||
int n_gpu_layers = 0;
|
||||
int mmq = 0;
|
||||
|
||||
std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
|
||||
std::vector<int> n_tg = { 128, 256, };
|
||||
std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
|
||||
//std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
|
||||
|
||||
if (argc >= 2) {
|
||||
params.model = argv[1];
|
||||
}
|
||||
|
||||
if (argc >= 3) {
|
||||
n_kv_max = std::atoi(argv[2]);
|
||||
}
|
||||
|
||||
if (argc >= 4) {
|
||||
is_pp_shared = std::atoi(argv[3]);
|
||||
}
|
||||
|
||||
if (argc >= 5) {
|
||||
n_gpu_layers = std::atoi(argv[4]);
|
||||
}
|
||||
|
||||
if (argc >= 6) {
|
||||
mmq = std::atoi(argv[5]);
|
||||
}
|
||||
|
||||
if (argc >= 7) {
|
||||
n_pp = parse_list(argv[6]);
|
||||
}
|
||||
|
||||
if (argc >= 8) {
|
||||
n_tg = parse_list(argv[7]);
|
||||
}
|
||||
|
||||
if (argc >= 9) {
|
||||
n_pl = parse_list(argv[8]);
|
||||
}
|
||||
|
||||
// init LLM
|
||||
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
// initialize the model
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
model_params.n_gpu_layers = n_gpu_layers;
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
|
||||
ctx_params.seed = 1234;
|
||||
ctx_params.n_ctx = n_kv_max;
|
||||
ctx_params.n_batch = 512;
|
||||
ctx_params.mul_mat_q = mmq;
|
||||
|
||||
ctx_params.n_threads = params.n_threads;
|
||||
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
|
||||
|
||||
// decode in batches of ctx_params.n_batch tokens
|
||||
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
0, 0, 0, // unused
|
||||
};
|
||||
|
||||
const int ret = llama_decode(ctx, batch_view);
|
||||
if (ret != 0) {
|
||||
LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
// warm up
|
||||
{
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
llama_batch_add(batch, 0, i, { 0 }, false);
|
||||
}
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
||||
|
||||
for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
|
||||
for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
|
||||
for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
|
||||
const int pp = n_pp[i_pp];
|
||||
const int tg = n_tg[i_tg];
|
||||
const int pl = n_pl[i_pl];
|
||||
|
||||
const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
|
||||
|
||||
if (n_ctx_req > n_kv_max) {
|
||||
continue;
|
||||
}
|
||||
|
||||
llama_batch_clear(batch);
|
||||
|
||||
const int n_tokens = is_pp_shared ? pp : pl*pp;
|
||||
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
llama_batch_add(batch, 0, i, { 0 }, false);
|
||||
}
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
|
||||
const auto t_pp_start = ggml_time_us();
|
||||
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (is_pp_shared) {
|
||||
for (int32_t i = 1; i < pl; ++i) {
|
||||
llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
|
||||
}
|
||||
}
|
||||
|
||||
const auto t_pp_end = ggml_time_us();
|
||||
|
||||
const auto t_tg_start = ggml_time_us();
|
||||
|
||||
for (int i = 0; i < tg; ++i) {
|
||||
llama_batch_clear(batch);
|
||||
|
||||
for (int j = 0; j < pl; ++j) {
|
||||
llama_batch_add(batch, 0, pp + i, { j }, true);
|
||||
}
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const auto t_tg_end = ggml_time_us();
|
||||
|
||||
const int32_t n_kv = n_ctx_req;
|
||||
|
||||
const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
|
||||
const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
|
||||
const float t = t_pp + t_tg;
|
||||
|
||||
const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
|
||||
const float speed_tg = pl*tg / t_tg;
|
||||
const float speed = n_kv / t;
|
||||
|
||||
LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
9
examples/batched.swift/.gitignore
vendored
@@ -1,9 +0,0 @@
|
||||
.DS_Store
|
||||
/.build
|
||||
/Packages
|
||||
xcuserdata/
|
||||
DerivedData/
|
||||
.swiftpm/configuration/registries.json
|
||||
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
|
||||
.netrc
|
||||
batched_swift
|
||||
@@ -1,6 +0,0 @@
|
||||
.PHONY: build
|
||||
|
||||
build:
|
||||
xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
|
||||
rm -f ./batched_swift
|
||||
ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
|
||||
@@ -1,22 +0,0 @@
|
||||
// swift-tools-version: 5.5
|
||||
// The swift-tools-version declares the minimum version of Swift required to build this package.
|
||||
|
||||
import PackageDescription
|
||||
|
||||
let package = Package(
|
||||
name: "batched_swift",
|
||||
platforms: [.macOS(.v12)],
|
||||
dependencies: [
|
||||
.package(name: "llama", path: "../../"),
|
||||
],
|
||||
targets: [
|
||||
// Targets are the basic building blocks of a package, defining a module or a test suite.
|
||||
// Targets can depend on other targets in this package and products from dependencies.
|
||||
.executableTarget(
|
||||
name: "batched_swift",
|
||||
dependencies: ["llama"],
|
||||
path: "Sources",
|
||||
linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
|
||||
),
|
||||
]
|
||||
)
|
||||
@@ -1,4 +0,0 @@
This is a Swift clone of `examples/batched`.

$ `make`
$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
@@ -1,263 +0,0 @@
|
||||
import Foundation
|
||||
import llama
|
||||
|
||||
let arguments = CommandLine.arguments
|
||||
|
||||
// Check that we have at least one argument (the model path)
|
||||
guard arguments.count > 1 else {
|
||||
print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]")
|
||||
exit(1)
|
||||
}
|
||||
|
||||
let modelPath: String = arguments[1]
|
||||
let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
|
||||
let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1
|
||||
|
||||
// total length of the sequences including the prompt
|
||||
let n_len: Int = 32
|
||||
|
||||
// init LLM
|
||||
llama_backend_init(false)
|
||||
defer {
|
||||
llama_backend_free()
|
||||
}
|
||||
|
||||
let model_params = llama_model_default_params()
|
||||
guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
|
||||
print("Failed to load model")
|
||||
exit(1)
|
||||
}
|
||||
|
||||
defer {
|
||||
llama_free_model(model)
|
||||
}
|
||||
|
||||
var tokens = tokenize(text: prompt, add_bos: true)
|
||||
|
||||
let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
|
||||
|
||||
var context_params = llama_context_default_params()
|
||||
context_params.seed = 1234
|
||||
context_params.n_ctx = n_kv_req
|
||||
context_params.n_batch = UInt32(max(n_len, n_parallel))
|
||||
context_params.n_threads = 8
|
||||
context_params.n_threads_batch = 8
|
||||
|
||||
let context = llama_new_context_with_model(model, context_params)
|
||||
guard context != nil else {
|
||||
print("Failed to initialize context")
|
||||
exit(1)
|
||||
}
|
||||
|
||||
defer {
|
||||
llama_free(context)
|
||||
}
|
||||
|
||||
let n_ctx = llama_n_ctx(context)
|
||||
|
||||
print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
|
||||
|
||||
if n_kv_req > n_ctx {
|
||||
print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
|
||||
exit(1)
|
||||
}
|
||||
|
||||
var buffer: [CChar] = []
|
||||
for id: llama_token in tokens {
|
||||
print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
|
||||
}
|
||||
|
||||
print("\n")
|
||||
|
||||
var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1)
|
||||
defer {
|
||||
llama_batch_free(batch)
|
||||
}
|
||||
|
||||
// evaluate the initial prompt
|
||||
batch.n_tokens = Int32(tokens.count)
|
||||
|
||||
for (i, token) in tokens.enumerated() {
|
||||
batch.token[i] = token
|
||||
batch.pos[i] = Int32(i)
|
||||
batch.n_seq_id[i] = 1
|
||||
// batch.seq_id[i][0] = 0
|
||||
// TODO: is this the proper way to do this?
|
||||
if let seq_id = batch.seq_id[i] {
|
||||
seq_id[0] = 0
|
||||
}
|
||||
batch.logits[i] = 0
|
||||
}
|
||||
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
batch.logits[Int(batch.n_tokens) - 1] = 1
|
||||
|
||||
if llama_decode(context, batch) != 0 {
|
||||
print("llama_decode() failed")
|
||||
exit(1)
|
||||
}
|
||||
|
||||
for i in 1 ..< n_parallel {
|
||||
llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
|
||||
}
|
||||
|
||||
if n_parallel > 1 {
|
||||
print("generating \(n_parallel) sequences ...\n")
|
||||
}
|
||||
|
||||
var streams: [String] = .init(repeating: "", count: n_parallel)
|
||||
var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
|
||||
var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)
|
||||
|
||||
var n_cur = batch.n_tokens
|
||||
var n_decode = 0
|
||||
|
||||
let t_main_start = ggml_time_us()
|
||||
|
||||
while n_cur <= n_len {
|
||||
// prepare the next batch
|
||||
batch.n_tokens = 0
|
||||
|
||||
// sample the next token for each parallel sequence / stream
|
||||
for i in 0 ..< n_parallel {
|
||||
if i_batch[i] < 0 {
|
||||
// the stream has already finished
|
||||
continue
|
||||
}
|
||||
|
||||
var n_vocab = llama_n_vocab(model)
|
||||
var logits = llama_get_logits_ith(context, i_batch[i])
|
||||
|
||||
var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
|
||||
|
||||
for token_id in 0 ..< n_vocab {
|
||||
candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
|
||||
}
|
||||
|
||||
var candidates_p: llama_token_data_array = .init(
|
||||
data: &candidates,
|
||||
size: candidates.count,
|
||||
sorted: false
|
||||
)
|
||||
|
||||
let top_k: Int32 = 40
|
||||
let top_p: Float = 0.9
|
||||
let temp: Float = 0.4
|
||||
|
||||
llama_sample_top_k(context, &candidates_p, top_k, 1)
|
||||
llama_sample_top_p(context, &candidates_p, top_p, 1)
|
||||
llama_sample_temp(context, &candidates_p, temp)
|
||||
|
||||
let new_token_id = llama_sample_token(context, &candidates_p)
|
||||
|
||||
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
|
||||
// is it an end of stream? -> mark the stream as finished
|
||||
if new_token_id == llama_token_eos(context) || n_cur == n_len {
|
||||
i_batch[i] = -1
|
||||
// print("")
|
||||
if n_parallel > 1 {
|
||||
print("stream \(i) finished at n_cur = \(n_cur)")
|
||||
}
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""
|
||||
|
||||
// if there is only one stream, we print immediately to stdout
|
||||
if n_parallel == 1 {
|
||||
print(nextStringPiece, terminator: "")
|
||||
}
|
||||
streams[i] += nextStringPiece
|
||||
|
||||
// push this new token for next evaluation
|
||||
batch.token[Int(batch.n_tokens)] = new_token_id
|
||||
batch.pos[Int(batch.n_tokens)] = n_cur
|
||||
batch.n_seq_id[Int(batch.n_tokens)] = 1
|
||||
if let seq_id = batch.seq_id[Int(batch.n_tokens)] {
|
||||
seq_id[0] = Int32(i)
|
||||
}
|
||||
batch.logits[Int(batch.n_tokens)] = 1
|
||||
|
||||
i_batch[i] = batch.n_tokens
|
||||
|
||||
batch.n_tokens += 1
|
||||
|
||||
n_decode += 1
|
||||
}
|
||||
|
||||
// all streams are finished
|
||||
if batch.n_tokens == 0 {
|
||||
break
|
||||
}
|
||||
|
||||
n_cur += 1
|
||||
|
||||
// evaluate the current batch with the transformer model
|
||||
if llama_decode(context, batch) != 0 {
|
||||
print("llama_decode() failed")
|
||||
exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
if n_parallel > 1 {
|
||||
print("\n")
|
||||
for (i, stream) in streams.enumerated() {
|
||||
print("sequence \(i):\n\n\(prompt)\(stream)\n")
|
||||
}
|
||||
}
|
||||
|
||||
let t_main_end = ggml_time_us()
|
||||
|
||||
print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
|
||||
|
||||
llama_print_timings(context)
|
||||
|
||||
private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
|
||||
let n_tokens = text.count + (add_bos ? 1 : 0)
|
||||
let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
|
||||
let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
|
||||
var swiftTokens: [llama_token] = []
|
||||
for i in 0 ..< tokenCount {
|
||||
swiftTokens.append(tokens[Int(i)])
|
||||
}
|
||||
tokens.deallocate()
|
||||
return swiftTokens
|
||||
}
|
||||
|
||||
private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
|
||||
var result = [CChar](repeating: 0, count: 8)
|
||||
let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
|
||||
if nTokens < 0 {
|
||||
if result.count >= -Int(nTokens) {
|
||||
result.removeLast(-Int(nTokens))
|
||||
} else {
|
||||
result.removeAll()
|
||||
}
|
||||
let check = llama_token_to_piece(
|
||||
model,
|
||||
token,
|
||||
&result,
|
||||
Int32(result.count)
|
||||
)
|
||||
assert(check == nTokens)
|
||||
} else {
|
||||
result.removeLast(result.count - Int(nTokens))
|
||||
}
|
||||
if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
|
||||
return utfString
|
||||
} else {
|
||||
buffer.append(contentsOf: result)
|
||||
let data = Data(buffer.map { UInt8(bitPattern: $0) })
|
||||
if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
|
||||
buffer = []
|
||||
}
|
||||
guard let bufferString = String(data: data, encoding: .utf8) else {
|
||||
return nil
|
||||
}
|
||||
buffer = []
|
||||
return bufferString
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -40,35 +40,20 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
// initialize the model
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
ctx_params.seed = 1234;
|
||||
ctx_params.n_ctx = n_len*n_parallel; // FIXME: use n_kv_req instead (tokenize with model after #3301)
|
||||
ctx_params.n_batch = std::max(n_len, n_parallel);
|
||||
// ctx_params.n_gpu_layers = 99; // offload all layers to the GPU
|
||||
|
||||
// model_params.n_gpu_layers = 99; // offload all layers to the GPU
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// tokenize the prompt
|
||||
|
||||
std::vector<llama_token> tokens_list;
|
||||
tokens_list = ::llama_tokenize(model, params.prompt, true);
|
||||
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
|
||||
|
||||
// initialize the context
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
|
||||
ctx_params.seed = 1234;
|
||||
ctx_params.n_ctx = n_kv_req;
|
||||
ctx_params.n_batch = std::max(n_len, n_parallel);
|
||||
ctx_params.n_threads = params.n_threads;
|
||||
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
if (ctx == NULL) {
|
||||
@@ -76,7 +61,13 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// tokenize the prompt
|
||||
|
||||
std::vector<llama_token> tokens_list;
|
||||
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
|
||||
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %d, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
|
||||
|
||||
@@ -97,20 +88,25 @@ int main(int argc, char ** argv) {
|
||||
|
||||
fflush(stderr);
|
||||
|
||||
// create a llama_batch
|
||||
// create a llama_batch with size 512
|
||||
// we use this object to submit token data for decoding
|
||||
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0, 1);
|
||||
|
||||
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t)n_parallel), 0);
|
||||
|
||||
// evaluate the initial prompt
|
||||
for (size_t i = 0; i < tokens_list.size(); ++i) {
|
||||
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
|
||||
batch.n_tokens = tokens_list.size();
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; i++) {
|
||||
batch.token[i] = tokens_list[i];
|
||||
batch.pos[i] = i;
|
||||
batch.seq_id[i] = 0;
|
||||
batch.logits[i] = false;
|
||||
}
|
||||
GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
|
||||
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
if (llama_decode(ctx, batch, params.n_threads) != 0) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -141,7 +137,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
while (n_cur <= n_len) {
|
||||
// prepare the next batch
|
||||
llama_batch_clear(batch);
|
||||
batch.n_tokens = 0;
|
||||
|
||||
// sample the next token for each parallel sequence / stream
|
||||
for (int32_t i = 0; i < n_parallel; ++i) {
|
||||
@@ -150,7 +146,7 @@ int main(int argc, char ** argv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
auto * logits = llama_get_logits_ith(ctx, i_batch[i]);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
@@ -193,10 +189,15 @@ int main(int argc, char ** argv) {
|
||||
|
||||
streams[i] += llama_token_to_piece(ctx, new_token_id);
|
||||
|
||||
// push this new token for next evaluation
|
||||
batch.token [batch.n_tokens] = new_token_id;
|
||||
batch.pos [batch.n_tokens] = n_cur;
|
||||
batch.seq_id[batch.n_tokens] = i;
|
||||
batch.logits[batch.n_tokens] = true;
|
||||
|
||||
i_batch[i] = batch.n_tokens;
|
||||
|
||||
// push this new token for next evaluation
|
||||
llama_batch_add(batch, new_token_id, n_cur, { i }, true);
|
||||
batch.n_tokens += 1;
|
||||
|
||||
n_decode += 1;
|
||||
}
|
||||
@@ -209,7 +210,7 @@ int main(int argc, char ** argv) {
|
||||
n_cur += 1;
|
||||
|
||||
// evaluate the current batch with the transformer model
|
||||
if (llama_decode(ctx, batch)) {
|
||||
if (llama_decode(ctx, batch, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -160,7 +160,7 @@ int main(int argc, char ** argv)
|
||||
|
||||
int n_past = 0;
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0), params.n_threads))
|
||||
{
|
||||
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
|
||||
return 1;
|
||||
@@ -170,7 +170,7 @@ int main(int argc, char ** argv)
|
||||
beam_search_callback_data callback_data{ctx, {}};
|
||||
size_t const beam_width = static_cast<size_t>(params.n_beams);
|
||||
int const n_predict = 256;
|
||||
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
|
||||
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
|
||||
|
||||
std::cout << "\n\n";
|
||||
for (llama_token const token_id : callback_data.response) {
|
||||
|
||||
@@ -9,7 +9,7 @@ if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
MODEL="${MODEL:-./models/llama-13b/ggml-model-q4_0.gguf}"
|
||||
MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
|
||||
PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
|
||||
USER_NAME="${USER_NAME:-User}"
|
||||
AI_NAME="${AI_NAME:-ChatLLaMa}"
|
||||
@@ -61,9 +61,9 @@ fi
|
||||
|
||||
if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
|
||||
echo 'Prompt cache does not exist, building...'
|
||||
# Default batch_size to 64 here for better user feedback during initial prompt processing
|
||||
# Default batch_size to 8 here for better user feedback during initial prompt processing
|
||||
./main 2>>"$LOG" \
|
||||
--batch_size 64 \
|
||||
--batch_size 8 \
|
||||
"${OPTS[@]}" \
|
||||
--prompt-cache "$PROMPT_CACHE_FILE" \
|
||||
--file "$CUR_PROMPT_FILE" \
|
||||
@@ -132,7 +132,7 @@ while read -e line; do
|
||||
# HACK get num tokens from debug message
|
||||
# TODO get both messages in one go
|
||||
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
|
||||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
|
||||
! sample_time_msg="$( tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
|
||||
echo >&2 "Couldn't get number of tokens from ./main output!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -48,7 +48,8 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
|
||||
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
||||
}
|
||||
struct MyModel * ret = new MyModel();
|
||||
ret->ctx = ctx;
|
||||
@@ -70,7 +71,7 @@ bool eval_float(void * model, float * input, int N){
|
||||
MyModel * mymodel = (MyModel*)model;
|
||||
llama_context * ctx = mymodel->ctx;
|
||||
gpt_params params = mymodel->params;
|
||||
int n_emb = llama_n_embd(llama_get_model(ctx));
|
||||
int n_emb = llama_n_embd(ctx);
|
||||
int n_past = mymodel->n_past;
|
||||
int n_batch = N; // params.n_batch;
|
||||
|
||||
@@ -79,8 +80,8 @@ bool eval_float(void * model, float * input, int N){
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
|
||||
if (llama_decode(ctx, batch)) {
|
||||
llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, n_past, 1, 0, };
|
||||
if (llama_decode(ctx, batch, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
@@ -101,7 +102,7 @@ bool eval_tokens(void * model, std::vector<llama_token> tokens) {
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0), params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
@@ -128,31 +129,30 @@ bool eval_string(struct MyModel * mymodel,const char* str){
|
||||
llama_token sampling_id(struct MyModel* mymodel) {
|
||||
llama_context* ctx = mymodel->ctx;
|
||||
gpt_params params = mymodel->params;
|
||||
llama_sampling_params & sparams = params.sampling_params;
|
||||
// int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
// out of user input, sample next token
|
||||
const float temp = sparams.temp;
|
||||
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
|
||||
const float top_p = sparams.top_p;
|
||||
const float tfs_z = sparams.tfs_z;
|
||||
const float typical_p = sparams.typical_p;
|
||||
const float temp = params.temp;
|
||||
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||
// const float repeat_penalty = params.repeat_penalty;
|
||||
// const float alpha_presence = params.presence_penalty;
|
||||
// const float alpha_frequency = params.frequency_penalty;
|
||||
const int mirostat = sparams.mirostat;
|
||||
const float mirostat_tau = sparams.mirostat_tau;
|
||||
const float mirostat_eta = sparams.mirostat_eta;
|
||||
const int mirostat = params.mirostat;
|
||||
const float mirostat_tau = params.mirostat_tau;
|
||||
const float mirostat_eta = params.mirostat_eta;
|
||||
// const bool penalize_nl = params.penalize_nl;
|
||||
|
||||
llama_token id = 0;
|
||||
{
|
||||
auto logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
|
||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ int main(int argc, char** argv) {
|
||||
auto mymodel = create_mymodel(argc, argv);
|
||||
int N = 10;
|
||||
int max_tgt_len = 500;
|
||||
int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));
|
||||
int n_embd = llama_n_embd(mymodel->ctx);
|
||||
|
||||
// add random float embd to test evaluation
|
||||
float * data = new float[N*n_embd];
|
||||
|
||||
@@ -42,18 +42,17 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
if (n_ctx > n_ctx_train) {
|
||||
const int n_ctx_train = llama_n_ctx_train(ctx);
|
||||
if (params.n_ctx > n_ctx_train) {
|
||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, n_ctx);
|
||||
__func__, n_ctx_train, params.n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
|
||||
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
||||
}
|
||||
|
||||
int n_past = 0;
|
||||
@@ -71,15 +70,15 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
if (embd_inp.size() > (size_t)n_ctx) {
|
||||
if (embd_inp.size() > (size_t)params.n_ctx) {
|
||||
fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
|
||||
__func__, embd_inp.size(), n_ctx);
|
||||
__func__, embd_inp.size(), params.n_ctx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
while (!embd_inp.empty()) {
|
||||
int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
|
||||
if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0))) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_tokens, n_past, 0), params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -87,8 +86,8 @@ int main(int argc, char ** argv) {
|
||||
embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
|
||||
}
|
||||
|
||||
const int n_embd = llama_n_embd(model);
|
||||
const auto * embeddings = llama_get_embeddings(ctx);
|
||||
const int n_embd = llama_n_embd(ctx);
|
||||
const auto embeddings = llama_get_embeddings(ctx);
|
||||
|
||||
for (int i = 0; i < n_embd; i++) {
|
||||
printf("%f ", embeddings[i]);
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET export-lora)
|
||||
add_executable(${TARGET} export-lora.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
@@ -1,26 +0,0 @@
# export-lora

Apply LORA adapters to a base model and export the resulting model.

```
usage: export-lora [options]

options:
  -h, --help                         show this help message and exit
  -m FNAME, --model-base FNAME       model path from which to load base model (default '')
  -o FNAME, --model-out FNAME        path to save exported model (default '')
  -l FNAME, --lora FNAME             apply LoRA adapter
  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S
  -t N, --threads N                  number of threads to use during computation (default: 4)
```

For example:

```bash
./bin/export-lora \
    -m open-llama-3b-v2-q8_0.gguf \
    -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
    -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
```

Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.
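For instance, a combined invocation might look like the following sketch (the adapter filenames and the 0.5 scale are placeholders for illustration, not files produced by this example):

```bash
# hypothetical example: merge two adapters into the base model, the second at 50% strength
./bin/export-lora \
    -m open-llama-3b-v2-q8_0.gguf \
    -o open-llama-3b-v2-q8_0-merged.gguf \
    -l lora-adapter-one-LATEST.bin \
    -s lora-adapter-two-LATEST.bin 0.5
```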
@@ -1,474 +0,0 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
|
||||
static const size_t tensor_alignment = 32;
|
||||
|
||||
struct lora_info {
|
||||
std::string filename;
|
||||
float scale;
|
||||
};
|
||||
|
||||
struct export_lora_params {
|
||||
std::string fn_model_base;
|
||||
std::string fn_model_out;
|
||||
std::vector<struct lora_info> lora;
|
||||
int n_threads;
|
||||
};
|
||||
|
||||
struct lora_data {
|
||||
struct lora_info info;
|
||||
std::vector<uint8_t> data;
|
||||
struct ggml_context * ctx;
|
||||
|
||||
uint32_t lora_r;
|
||||
uint32_t lora_alpha;
|
||||
};
|
||||
|
||||
struct llama_file {
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
size_t size;
|
||||
|
||||
llama_file(const char * fname, const char * mode) {
|
||||
fp = std::fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
size = 0;
|
||||
} else {
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
seek(0, SEEK_SET);
|
||||
}
|
||||
}
|
||||
|
||||
size_t tell() const {
|
||||
#ifdef _WIN32
|
||||
__int64 ret = _ftelli64(fp);
|
||||
#else
|
||||
long ret = std::ftell(fp);
|
||||
#endif
|
||||
GGML_ASSERT(ret != -1); // this really shouldn't fail
|
||||
return (size_t) ret;
|
||||
}
|
||||
|
||||
void seek(size_t offset, int whence) {
|
||||
#ifdef _WIN32
|
||||
int ret = _fseeki64(fp, (__int64) offset, whence);
|
||||
#else
|
||||
int ret = std::fseek(fp, (long) offset, whence);
|
||||
#endif
|
||||
GGML_ASSERT(ret == 0); // same
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
std::size_t ret = std::fread(ptr, size, 1, fp);
|
||||
if (ferror(fp)) {
|
||||
die_fmt("read error: %s", strerror(errno));
|
||||
}
|
||||
if (ret != 1) {
|
||||
die("unexpectedly reached end of file");
|
||||
}
|
||||
}
|
||||
|
||||
std::uint32_t read_u32() {
|
||||
std::uint32_t ret;
|
||||
read_raw(&ret, sizeof(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::string read_string(std::uint32_t len) {
|
||||
std::vector<char> chars(len);
|
||||
read_raw(chars.data(), len);
|
||||
return std::string(chars.data(), len);
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
size_t ret = std::fwrite(ptr, size, 1, fp);
|
||||
if (ret != 1) {
|
||||
die_fmt("write error: %s", strerror(errno));
|
||||
}
|
||||
}
|
||||
|
||||
void write_u32(std::uint32_t val) {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
bool eof() {
|
||||
return tell() >= size;
|
||||
}
|
||||
|
||||
~llama_file() {
|
||||
if (fp) {
|
||||
std::fclose(fp);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static struct export_lora_params get_default_export_lora_params() {
|
||||
struct export_lora_params result;
|
||||
result.fn_model_base = "";
|
||||
result.fn_model_out = "";
|
||||
result.n_threads = GGML_DEFAULT_N_THREADS;
|
||||
return result;
|
||||
}
|
||||
|
||||
static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -m FNAME, --model-base FNAME model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
|
||||
fprintf(stderr, " -o FNAME, --model-out FNAME path to save exported model (default '%s')\n", params->fn_model_out.c_str());
|
||||
fprintf(stderr, " -l FNAME, --lora FNAME apply LoRA adapter\n");
|
||||
fprintf(stderr, " -s FNAME S, --lora-scaled FNAME S apply LoRA adapter with user defined scaling S\n");
|
||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params->n_threads);
|
||||
}
|
||||
|
||||
static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
|
||||
bool invalid_param = false;
|
||||
std::string arg;
|
||||
struct export_lora_params default_params = get_default_export_lora_params();
|
||||
const std::string arg_prefix = "--";
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
||||
std::replace(arg.begin(), arg.end(), '_', '-');
|
||||
}
|
||||
|
||||
if (arg == "-m" || arg == "--model-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params->fn_model_base = argv[i];
|
||||
} else if (arg == "-o" || arg == "--model-out") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params->fn_model_out = argv[i];
|
||||
} else if (arg == "-l" || arg == "--lora") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
struct lora_info lora;
|
||||
lora.filename = argv[i];
|
||||
lora.scale = 1.0f;
|
||||
params->lora.push_back(lora);
|
||||
} else if (arg == "-s" || arg == "--lora-scaled") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
struct lora_info lora;
|
||||
lora.filename = argv[i];
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
lora.scale = std::stof(argv[i]);
|
||||
params->lora.push_back(lora);
|
||||
} else if (arg == "-t" || arg == "--threads") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params->n_threads = std::stoi(argv[i]);
|
||||
if (params->n_threads <= 0) {
|
||||
params->n_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
|
||||
export_lora_print_usage(argc, argv, &default_params);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (params->fn_model_base == default_params.fn_model_base) {
|
||||
fprintf(stderr, "error: please specify a filename for model-base.\n");
|
||||
export_lora_print_usage(argc, argv, &default_params);
|
||||
exit(1);
|
||||
}
|
||||
if (params->fn_model_out == default_params.fn_model_out) {
|
||||
fprintf(stderr, "error: please specify a filename for model-out.\n");
|
||||
export_lora_print_usage(argc, argv, &default_params);
|
||||
exit(1);
|
||||
}
|
||||
if (invalid_param) {
|
||||
fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
|
||||
export_lora_print_usage(argc, argv, &default_params);
|
||||
exit(1);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void free_lora(struct lora_data * lora) {
|
||||
if (lora->ctx != NULL) {
|
||||
ggml_free(lora->ctx);
|
||||
}
|
||||
delete lora;
|
||||
}
|
||||
|
||||
static struct lora_data * load_lora(struct lora_info * info) {
|
||||
struct lora_data * result = new struct lora_data;
|
||||
result->info = *info;
|
||||
result->ctx = NULL;
|
||||
result->lora_r = 1;
|
||||
result->lora_alpha = 1;
|
||||
|
||||
struct llama_file file(info->filename.c_str(), "rb");
|
||||
if (file.fp == NULL) {
|
||||
fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
|
||||
info->filename.c_str());
|
||||
free_lora(result);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct ggml_init_params params_ggml;
|
||||
params_ggml.mem_size = ggml_tensor_overhead() * GGML_MAX_NODES;
|
||||
params_ggml.mem_buffer = NULL;
|
||||
params_ggml.no_alloc = true;
|
||||
result->ctx = ggml_init(params_ggml);
|
||||
|
||||
uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
|
||||
uint32_t magic = file.read_u32();
|
||||
if (magic != LLAMA_FILE_MAGIC_LORA) {
|
||||
die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
|
||||
}
|
||||
uint32_t version = file.read_u32();
|
||||
if (version != 1) {
|
||||
die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
|
||||
}
|
||||
result->lora_r = file.read_u32();
|
||||
result->lora_alpha = file.read_u32();
|
||||
// read tensor infos from file
|
||||
std::vector<char> name_buf;
|
||||
std::vector<struct ggml_tensor *> tensors;
|
||||
std::vector<size_t> tensors_offset;
|
||||
size_t total_nbytes_pad = 0;
|
||||
while(!file.eof()) {
|
||||
int64_t ne[4] = {1,1,1,1};
|
||||
uint32_t n_dims = file.read_u32();
|
||||
uint32_t namelen = file.read_u32();
|
||||
uint32_t type = file.read_u32();
|
||||
for (uint32_t k = 0; k < n_dims; ++k) {
|
||||
ne[k] = (int64_t)file.read_u32();
|
||||
}
|
||||
name_buf.clear();
|
||||
name_buf.resize(namelen + 1, '\0');
|
||||
file.read_raw(name_buf.data(), namelen);
|
||||
file.seek((0-file.tell()) & 31, SEEK_CUR);
|
||||
size_t offset = file.tell();
|
||||
struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
|
||||
ggml_set_name(tensor, name_buf.data());
|
||||
size_t nbytes = ggml_nbytes(tensor);
|
||||
size_t nbytes_pad = ggml_nbytes_pad(tensor);
|
||||
total_nbytes_pad += nbytes_pad;
|
||||
tensors.push_back(tensor);
|
||||
tensors_offset.push_back(offset);
|
||||
file.seek(nbytes, SEEK_CUR);
|
||||
}
|
||||
// read tensor data
|
||||
result->data.resize(total_nbytes_pad);
|
||||
size_t data_offset = 0;
|
||||
for (size_t i = 0; i < tensors.size(); ++i) {
|
||||
struct ggml_tensor * tensor = tensors[i];
|
||||
size_t offset = tensors_offset[i];
|
||||
size_t nbytes = ggml_nbytes(tensor);
|
||||
size_t nbytes_pad = ggml_nbytes_pad(tensor);
|
||||
file.seek(offset, SEEK_SET);
|
||||
tensor->data = result->data.data() + data_offset;
|
||||
file.read_raw(tensor->data, nbytes);
|
||||
data_offset += nbytes_pad;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static struct ggml_cgraph * build_graph_lora(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * tensor,
|
||||
struct ggml_tensor * lora_a,
|
||||
struct ggml_tensor * lora_b,
|
||||
float scaling
|
||||
) {
|
||||
struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
|
||||
if (scaling != 1.0f) {
|
||||
ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling));
|
||||
}
|
||||
struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);
|
||||
|
||||
struct ggml_cgraph * gf = ggml_new_graph(ctx);
|
||||
ggml_build_forward_expand (gf, res);
|
||||
return gf;
|
||||
}
|
||||
|
||||
static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
|
||||
if (lora->ctx == NULL) {
|
||||
return false;
|
||||
}
|
||||
std::string name = ggml_get_name(tensor);
|
||||
std::string name_a = name + std::string(".loraA");
|
||||
std::string name_b = name + std::string(".loraB");
|
||||
struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
|
||||
struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
|
||||
if (lora_a == NULL || lora_b == NULL) {
|
||||
return false;
|
||||
}
|
||||
|
||||
float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = true;
|
||||
struct ggml_context * ctx = NULL;
|
||||
struct ggml_allocr * alloc = NULL;
|
||||
struct ggml_cgraph * gf = NULL;
|
||||
|
||||
ctx = ggml_init(params);
|
||||
alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||
gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
|
||||
size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf);
|
||||
ggml_allocr_free(alloc);
|
||||
ggml_free(ctx);
|
||||
|
||||
static std::vector<uint8_t> data_compute;
|
||||
data_compute.resize(alloc_size + tensor_alignment);
|
||||
|
||||
ctx = ggml_init(params);
|
||||
alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment);
|
||||
gf = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
|
||||
ggml_allocr_alloc_graph(alloc, gf);
|
||||
ggml_allocr_free(alloc);
|
||||
|
||||
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
|
||||
static std::vector<uint8_t> data_work;
|
||||
data_work.resize(cplan.work_size);
|
||||
cplan.work_data = data_work.data();
|
||||
|
||||
ggml_graph_compute(gf, &cplan);
|
||||
|
||||
ggml_free(ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void export_lora(struct export_lora_params * params) {
|
||||
// load all loras
|
||||
std::vector<struct lora_data *> loras;
|
||||
for (size_t i = 0; i < params->lora.size(); ++i) {
|
||||
struct lora_data * lora = load_lora(¶ms->lora[i]);
|
||||
if (lora != NULL) {
|
||||
loras.push_back(lora);
|
||||
}
|
||||
}
|
||||
if (loras.size() == 0) {
|
||||
fprintf(stderr, "warning: no lora adapters will be applied.\n");
|
||||
}
|
||||
|
||||
// open input file
|
||||
struct llama_file fin(params->fn_model_base.c_str(), "rb");
|
||||
if (!fin.fp) {
|
||||
die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
|
||||
}
|
||||
|
||||
// open base model gguf, read tensors without their data
|
||||
struct ggml_context * ctx_in;
|
||||
struct gguf_init_params params_gguf;
|
||||
params_gguf.no_alloc = true;
|
||||
params_gguf.ctx = &ctx_in;
|
||||
struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);
|
||||
|
||||
// create new gguf
|
||||
struct gguf_context * gguf_out = gguf_init_empty();
|
||||
|
||||
// copy meta data from base model: kv and tensors
|
||||
gguf_set_kv(gguf_out, gguf_in);
|
||||
int n_tensors = gguf_get_n_tensors(gguf_in);
|
||||
for (int i=0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(gguf_in, i);
|
||||
struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
|
||||
gguf_add_tensor(gguf_out, tensor);
|
||||
}
|
||||
|
||||
// create output file
|
||||
struct llama_file fout(params->fn_model_out.c_str(), "wb");
|
||||
if (!fout.fp) {
|
||||
die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
|
||||
}
|
||||
|
||||
// write gguf meta data
|
||||
std::vector<uint8_t> meta;
|
||||
meta.resize(gguf_get_meta_size(gguf_out));
|
||||
gguf_get_meta_data(gguf_out, meta.data());
|
||||
fout.write_raw(meta.data(), meta.size());
|
||||
|
||||
std::vector<uint8_t> data;
|
||||
std::vector<uint8_t> padding;
|
||||
for (int i=0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(gguf_in, i);
|
||||
struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
|
||||
|
||||
// read tensor data
|
||||
data.resize(ggml_nbytes(tensor));
|
||||
tensor->data = data.data();
|
||||
size_t offset = gguf_get_tensor_offset(gguf_in, i);
|
||||
fin.seek(offset + meta.size(), SEEK_SET);
|
||||
fin.read_raw(data.data(), data.size());
|
||||
|
||||
// apply all loras
|
||||
for (size_t k = 0; k < loras.size(); ++k) {
|
||||
apply_lora(tensor, loras[k], params->n_threads);
|
||||
}
|
||||
|
||||
// write tensor data + padding
|
||||
padding.clear();
|
||||
padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);
|
||||
|
||||
GGML_ASSERT(fout.tell() == offset + meta.size());
|
||||
// fout.seek(offset + meta.size(), SEEK_SET);
|
||||
fout.write_raw(data.data(), data.size());
|
||||
fout.write_raw(padding.data(), padding.size());
|
||||
|
||||
if (i % 2 == 0) {
|
||||
printf(".");
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
// close gguf
|
||||
gguf_free(gguf_out);
|
||||
gguf_free(gguf_in);
|
||||
|
||||
// free loras
|
||||
for (size_t i = 0; i < loras.size(); ++i) {
|
||||
free_lora(loras[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
struct export_lora_params params = get_default_export_lora_params();
|
||||
|
||||
if (!export_lora_params_parse(argc, argv, ¶ms)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
export_lora(¶ms);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,5 +0,0 @@
|
||||
set(TARGET finetune)
|
||||
add_executable(${TARGET} finetune.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
@@ -1,90 +0,0 @@
# finetune

Basic usage instructions:

```bash
# get training data
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt

# finetune LORA adapter
./bin/finetune \
        --model-base open-llama-3b-v2-q8_0.gguf \
        --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
        --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
        --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
        --train-data "shakespeare.txt" \
        --save-every 10 \
        --threads 6 --adam-iter 30 --batch 4 --ctx 64 \
        --use-checkpointing

# predict
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```

Finetune output files will be saved every N iterations (configured with `--save-every N`).
The pattern 'ITERATION' in the output filenames will be replaced with the iteration number, and with 'LATEST' for the latest output.
So in the above example, after 10 iterations these files will be written:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin

After 10 more iterations:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin

Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, finetuning starts from a new, randomly initialized adapter.

llama.cpp compatible LORA adapters will be saved with the filename specified by `--lora-out FN`.
These LORA adapters can then be used by `main` together with the base model, as in the 'predict' example command above.

In `main` you can also load multiple LORA adapters, which will then be mixed together.

For example, if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:

```bash
./bin/main -m open-llama-3b-v2-q8_0.gguf \
  --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
  --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```

You can change how strongly each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.

For example, to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:

```bash
./bin/main -m open-llama-3b-v2-q8_0.gguf \
  --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
  --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
```

The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
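As a rough sketch of what the SCALE value does (mirroring the `scaling = scale * lora_alpha / lora_r` computation used by the `export-lora` tool shown elsewhere in this comparison), the effective update applied to a base weight matrix is approximately

$$W' \approx W + s \cdot \frac{\alpha}{r} \cdot \Delta W_{\text{lora}}$$

where $s$ is the SCALE argument, $\alpha$ and $r$ are the adapter's lora-alpha and rank, and $\Delta W_{\text{lora}}$ is the low-rank product stored in the adapter. This is only meant to convey why larger SCALE values strengthen an adapter's influence; the exact tensor layout depends on the adapter file.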

Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.

The default LORA rank can be specified with `--lora-r N`.
The LORA rank can be configured for each model tensor type separately with these command line options:

```bash
  --lora-r N           LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
  --rank-att-norm N    LORA rank for attention norm tensor (default 1)
  --rank-ffn-norm N    LORA rank for feed-forward norm tensor (default 1)
  --rank-out-norm N    LORA rank for output norm tensor (default 1)
  --rank-tok-embd N    LORA rank for token embeddings tensor (default 4)
  --rank-out N         LORA rank for output tensor (default 4)
  --rank-wq N          LORA rank for wq tensor (default 4)
  --rank-wk N          LORA rank for wk tensor (default 4)
  --rank-wv N          LORA rank for wv tensor (default 4)
  --rank-wo N          LORA rank for wo tensor (default 4)
  --rank-w1 N          LORA rank for w1 tensor (default 4)
  --rank-w2 N          LORA rank for w2 tensor (default 4)
  --rank-w3 N          LORA rank for w3 tensor (default 4)
```

The LORA rank of 'norm' tensors should always be 1.
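For instance, a run that raises the rank of the attention projection tensors while keeping the other defaults might look like this (a hypothetical sketch reusing the model and training data from the usage example above; add checkpoint options as needed):

```bash
# hypothetical: default rank 8, higher rank for the attention projections
./bin/finetune \
        --model-base open-llama-3b-v2-q8_0.gguf \
        --train-data "shakespeare.txt" \
        --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
        --lora-r 8 \
        --rank-wq 16 --rank-wk 16 --rank-wv 16 --rank-wo 16
```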

To see all available options use `finetune --help`.
@@ -1,489 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# finetune checkpoint --> gguf conversion
|
||||
|
||||
import argparse
|
||||
import gguf
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
# gguf constants
|
||||
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
|
||||
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
|
||||
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
|
||||
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
|
||||
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
|
||||
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
|
||||
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
|
||||
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
|
||||
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
|
||||
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
|
||||
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
|
||||
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
|
||||
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
|
||||
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
|
||||
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
|
||||
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
||||
|
||||
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
|
||||
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
|
||||
LLM_KV_TRAINING_TYPE = "training.type"
|
||||
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
||||
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
||||
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
||||
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
||||
|
||||
LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"
|
||||
LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
|
||||
LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"
|
||||
LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"
|
||||
LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"
|
||||
LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"
|
||||
LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"
|
||||
LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"
|
||||
LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"
|
||||
LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"
|
||||
LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"
|
||||
LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"
|
||||
|
||||
class Tensor:
|
||||
def __init__(self, dtype='f', ne=None):
|
||||
if ne is None:
|
||||
ne = []
|
||||
self.dtype = dtype
|
||||
self.ne = ne
|
||||
self.nbytes = 0
|
||||
if self.dtype == 'f':
|
||||
if len(self.ne) == 0:
|
||||
self.nbytes = 0
|
||||
else:
|
||||
self.nbytes = int(np.product(self.ne)) * 4
|
||||
else:
|
||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||
|
||||
def load(self, data, offset):
|
||||
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
assert(nd == len(self.ne))
|
||||
ne = []
|
||||
for d in range(nd):
|
||||
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
ne.append(n)
|
||||
|
||||
if tuple(ne) != tuple(self.ne):
|
||||
raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
|
||||
|
||||
if self.dtype == 'f':
|
||||
assert(dtype == 0)
|
||||
else:
|
||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||
|
||||
self.name = bytes(data[offset:offset+namelen]); offset += namelen
|
||||
# 32-byte alignment
|
||||
offset += (0 - offset) & 31
|
||||
self.data = data[offset:offset+self.nbytes]
|
||||
offset += self.nbytes
|
||||
return offset
|
||||
|
||||
def max_storage_size(self):
|
||||
result = 0
|
||||
result += 4 # nd
|
||||
result += 4 # namelen
|
||||
result += 4 # dtype
|
||||
result += len(self.ne)*8 # ne
|
||||
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
|
||||
result += 31 # 32-byte alignment
|
||||
result += self.nbytes
|
||||
return result
|
||||
|
||||
def save_gguf(self, gguf_writer, name):
|
||||
gguf_writer.add_tensor(
|
||||
name=name,
|
||||
tensor=self.data,
|
||||
raw_shape=np.array(list(reversed(self.ne))),
|
||||
raw_dtype=gguf.GGMLQuantizationType.F32)
|
||||
|
||||
class OptimizationContext:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
|
||||
offset += 4
|
||||
|
||||
if self.version != 1:
|
||||
raise ValueError('Invalid version of optimization context in checkpoint file')
|
||||
|
||||
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
||||
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
||||
|
||||
self.adam_m = Tensor('f', [self.nx])
|
||||
self.adam_v = Tensor('f', [self.nx])
|
||||
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
|
||||
self.lbfgs_x = Tensor('f', [self.nx])
|
||||
self.lbfgs_xp = Tensor('f', [self.nx])
|
||||
self.lbfgs_g = Tensor('f', [self.nx])
|
||||
self.lbfgs_gp = Tensor('f', [self.nx])
|
||||
self.lbfgs_d = Tensor('f', [self.nx])
|
||||
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
|
||||
# forgot to save type in version 1:
|
||||
# guess self.type from number of remaining bytes
|
||||
size_type_0 = 12 + sum([t.max_storage_size() for t in
|
||||
[self.adam_m, self.adam_v]
|
||||
+([self.adam_pf] if (self.past > 0) else [])])
|
||||
size_type_1 = 24 + sum([t.max_storage_size() for t in
|
||||
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
|
||||
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
|
||||
self.lbfgs_lmal, self.lbfgs_lmys,
|
||||
self.lbfgs_lms, self.lbfgs_lmy]
|
||||
+([self.lbfgs_pf] if (self.past > 0) else [])])
|
||||
# due to alignment padding the size might not by exact
|
||||
# but the difference in size for both types is significant,
|
||||
# so we can just use whichever is closest
|
||||
remaining = len(data) - offset
|
||||
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
|
||||
self.type = 0
|
||||
else:
|
||||
self.type = 1
|
||||
|
||||
if self.type == 0:
|
||||
offset = self.adam_m.load(data, offset)
|
||||
offset = self.adam_v.load(data, offset)
|
||||
offset = self.adam_pf.load(data,offset)
|
||||
|
||||
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
elif self.type == 1:
|
||||
offset = self.lbfgs_x.load(data, offset)
|
||||
offset = self.lbfgs_xp.load(data, offset)
|
||||
offset = self.lbfgs_g.load(data, offset)
|
||||
offset = self.lbfgs_gp.load(data, offset)
|
||||
offset = self.lbfgs_d.load(data, offset)
|
||||
offset = self.lbfgs_pf.load(data, offset)
|
||||
offset = self.lbfgs_lmal.load(data, offset)
|
||||
offset = self.lbfgs_lmys.load(data, offset)
|
||||
offset = self.lbfgs_lms.load(data, offset)
|
||||
offset = self.lbfgs_lmy.load(data, offset)
|
||||
|
||||
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
else:
|
||||
raise ValueError(f"Invalid optimizer type '{self.type}'")
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
|
||||
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
|
||||
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
|
||||
|
||||
if self.type == 0:
|
||||
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
|
||||
|
||||
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
|
||||
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
|
||||
if self.past > 0:
|
||||
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
|
||||
|
||||
elif self.type == 1:
|
||||
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
|
||||
|
||||
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
|
||||
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
|
||||
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
|
||||
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
|
||||
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
|
||||
if self.past > 0:
|
||||
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
|
||||
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
|
||||
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
|
||||
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
|
||||
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
|
||||
else:
|
||||
raise ValueError('Unknown optimizer type')
|
||||
|
||||
class LoraParams:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.n_rank_attention_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_wq = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_wk = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_wv = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_wo = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_ffn_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_w1 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_w2 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_w3 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_tok_embeddings = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rank_output = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, self.n_rank_tok_embeddings)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT, self.n_rank_output)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, self.n_rank_attention_norm)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q, self.n_rank_wq)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K, self.n_rank_wk)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V, self.n_rank_wv)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, self.n_rank_wo)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM, self.n_rank_ffn_norm)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE, self.n_rank_w1)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, self.n_rank_w2)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP, self.n_rank_w3)
|
||||
|
||||
class ModelParams:
|
||||
def __init__(self, n_ff = None):
|
||||
self.n_ff = n_ff
|
||||
|
||||
def load(self, data, offset):
|
||||
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
return offset
|
||||
|
||||
def get_n_ff(self):
|
||||
if self.n_ff is None:
|
||||
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
|
||||
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
|
||||
else:
|
||||
return self.n_ff
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
# self.n_vocab not saved
|
||||
gguf_writer.add_embedding_length(self.n_embd)
|
||||
gguf_writer.add_head_count(self.n_head)
|
||||
gguf_writer.add_block_count(self.n_layer)
|
||||
gguf_writer.add_rope_dimension_count(self.n_rot)
|
||||
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
||||
|
||||
def tensor_name(key, bid=None, suffix=".weight"):
|
||||
return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
|
||||
|
||||
class Layer:
|
||||
def __init__(self, params, lora_params, bid):
|
||||
self.bid = bid
|
||||
self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
|
||||
self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
|
||||
self.wq_a = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
|
||||
self.wq_b = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
|
||||
self.wk_a = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
|
||||
self.wk_b = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
|
||||
self.wv_a = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
|
||||
self.wv_b = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
|
||||
self.wo_a = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
|
||||
self.wo_b = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
|
||||
self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
|
||||
self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
|
||||
self.w1_a = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
|
||||
self.w1_b = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
|
||||
self.w2_a = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
|
||||
self.w2_b = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
|
||||
self.w3_a = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
|
||||
self.w3_b = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
|
||||
|
||||
def load(self, data, offset):
|
||||
offset = self.att_norm_a.load(data, offset)
|
||||
offset = self.att_norm_b.load(data, offset)
|
||||
offset = self.wq_a.load(data, offset)
|
||||
offset = self.wq_b.load(data, offset)
|
||||
offset = self.wk_a.load(data, offset)
|
||||
offset = self.wk_b.load(data, offset)
|
||||
offset = self.wv_a.load(data, offset)
|
||||
offset = self.wv_b.load(data, offset)
|
||||
offset = self.wo_a.load(data, offset)
|
||||
offset = self.wo_b.load(data, offset)
|
||||
offset = self.ffn_norm_a.load(data, offset)
|
||||
offset = self.ffn_norm_b.load(data, offset)
|
||||
offset = self.w1_a.load(data, offset)
|
||||
offset = self.w1_b.load(data, offset)
|
||||
offset = self.w2_a.load(data, offset)
|
||||
offset = self.w2_b.load(data, offset)
|
||||
offset = self.w3_a.load(data, offset)
|
||||
offset = self.w3_b.load(data, offset)
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
|
||||
self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
|
||||
self.wq_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_a"))
|
||||
self.wq_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_b"))
|
||||
self.wk_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_a"))
|
||||
self.wk_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_b"))
|
||||
self.wv_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_a"))
|
||||
self.wv_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_b"))
|
||||
self.wo_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_a"))
|
||||
self.wo_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_b"))
|
||||
self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_a"))
|
||||
self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_b"))
|
||||
self.w1_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_a"))
|
||||
self.w1_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_b"))
|
||||
self.w2_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_a"))
|
||||
self.w2_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_b"))
|
||||
self.w3_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_a"))
|
||||
self.w3_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_b"))
|
||||
|
||||
class LoraModel:
|
||||
def __init__(self, n_ff = None):
|
||||
self.params = ModelParams(n_ff = n_ff)
|
||||
self.lora_params = LoraParams()
|
||||
self.layers = []
|
||||
|
||||
def load(self, data, offset):
|
||||
offset = self.params.load(data, offset)
|
||||
offset = self.lora_params.load(data, offset)
|
||||
|
||||
self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
|
||||
self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
|
||||
self.norm_a = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
|
||||
self.norm_b = Tensor('f', [self.lora_params.n_rank_norm, 1])
|
||||
self.output_a = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
|
||||
self.output_b = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
|
||||
|
||||
offset = self.tok_embd_a.load(data, offset)
|
||||
offset = self.tok_embd_b.load(data, offset)
|
||||
offset = self.norm_a.load(data, offset)
|
||||
offset = self.norm_b.load(data, offset)
|
||||
offset = self.output_a.load(data, offset)
|
||||
offset = self.output_b.load(data, offset)
|
||||
|
||||
self.layers.clear()
|
||||
for bid in range(self.params.n_layer):
|
||||
layer = Layer(self.params, self.lora_params, bid)
|
||||
offset = layer.load(data, offset)
|
||||
self.layers.append(layer)
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
self.params.save_gguf(gguf_writer)
|
||||
self.lora_params.save_gguf(gguf_writer)
|
||||
|
||||
self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_a"))
|
||||
self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_b"))
|
||||
self.norm_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
|
||||
self.norm_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
|
||||
self.output_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_a"))
|
||||
self.output_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_b"))
|
||||
|
||||
for layer in self.layers:
|
||||
layer.save_gguf(gguf_writer)
|
||||
|
||||
class LoraCheckpoint:
|
||||
def __init__(self, n_ff = None):
|
||||
self.model = LoraModel(n_ff = n_ff)
|
||||
self.opt_ctx = OptimizationContext()
|
||||
|
||||
def load(self, data, offset):
|
||||
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
|
||||
if magic != b'ggcl':
|
||||
raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
|
||||
|
||||
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
if self.version != 0:
|
||||
raise ValueError('Invalid version of checkpoint file')
|
||||
|
||||
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
offset = self.model.load(data, offset)
|
||||
offset = self.opt_ctx.load(data, offset)
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
||||
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
||||
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
||||
self.model.save_gguf(gguf_writer)
|
||||
self.opt_ctx.save_gguf(gguf_writer)
|
||||
|
||||
def handle_args():
|
||||
parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
|
||||
parser.add_argument('--input', '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
|
||||
parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
|
||||
parser.add_argument('--ff', type = int, help = "Feedforward size, if not provided compute from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
|
||||
return parser.parse_args()
|
||||
|
||||
def main():
|
||||
cfg = handle_args()
|
||||
print(cfg)
|
||||
data = np.memmap(cfg.input, mode = 'r')
|
||||
chk = LoraCheckpoint(n_ff = cfg.ff)
|
||||
offset = 0
|
||||
offset = chk.load(data, offset)
|
||||
# we should have read all available data
|
||||
assert(offset == len(data))
|
||||
|
||||
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
|
||||
chk.save_gguf(gguf_writer)
|
||||
print(" gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print(" gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print(" gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
gguf_writer.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
File diff suppressed because it is too large
@@ -1,8 +0,0 @@
|
||||
set(TARGET infill)
|
||||
add_executable(${TARGET} infill.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
@@ -1,41 +0,0 @@
# llama.cpp/example/infill

This example shows how to use the infill mode with Code Llama models that support it.
Currently the 7B and 13B models support infill mode.

Infill supports most of the options available in the main example.

For further information, have a look at the main README.md in llama.cpp/example/main/README.md

## Common Options

In this section, we cover the most commonly used options for running the `infill` program with the LLaMA models:

-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
-   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.

## Input Prompts

The `infill` program provides several ways to interact with the LLaMA models using input prompts:

-   `--in-prefix PROMPT_BEFORE_CURSOR`: Provide the prefix directly as a command-line option.
-   `--in-suffix PROMPT_AFTER_CURSOR`: Provide the suffix directly as a command-line option.
-   `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)

## Interaction

The `infill` program offers a seamless way to interact with LLaMA models, allowing users to receive real-time infill suggestions. The interactive mode can be triggered using `--interactive` or `--interactive-first`.

### Interaction Options

-   `-i, --interactive`: Run the program in interactive mode, allowing users to get real-time code suggestions from the model.
-   `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
-   `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.

### Example

```bash
./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n "
```
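Internally, the prefix and suffix are combined into a single fill-in-the-middle prompt (prefix token, prefix, suffix token, suffix, then a middle token), as built in `infill.cpp`. A rough Python illustration of that ordering, using placeholder token ids rather than the real special-token ids:

```python
# Rough illustration (placeholder ids only) of the fill-in-the-middle prompt layout built by infill.cpp.
BOS, PRE, SUF, MID = 1, 2, 3, 4  # placeholder token ids, not the real vocabulary ids

def build_infill_prompt(prefix_tokens, suffix_tokens, add_bos=True):
    tokens = [PRE] + list(prefix_tokens)      # prefix token + tokenized --in-prefix
    if add_bos:
        tokens.insert(0, BOS)                 # BOS is prepended for SPM vocabularies
    tokens += [SUF] + list(suffix_tokens)     # suffix token + tokenized --in-suffix
    tokens.append(MID)                        # generation continues after the middle token
    return tokens

print(build_infill_prompt([10, 11, 12], [20, 21]))  # -> [1, 2, 10, 11, 12, 3, 20, 21, 4]
```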
@@ -1,792 +0,0 @@
|
||||
#include "common.h"
|
||||
|
||||
#include "console.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
#include "grammar-parser.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cinttypes>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include <signal.h>
|
||||
#include <unistd.h>
|
||||
#elif defined (_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <signal.h>
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
static llama_context ** g_ctx;
|
||||
static llama_model ** g_model;
|
||||
static gpt_params * g_params;
|
||||
static std::vector<llama_token> * g_input_tokens;
|
||||
static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
static bool is_interacting = false;
|
||||
|
||||
|
||||
static void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
const std::vector<llama_token> & input_tokens, const std::string & output,
|
||||
const std::vector<llama_token> & output_tokens
|
||||
) {
|
||||
if (params.logdir.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string timestamp = get_sortable_timestamp();
|
||||
|
||||
const bool success = create_directory_with_parents(params.logdir);
|
||||
if (!success) {
|
||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||
__func__, params.logdir.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string logfile_path = params.logdir + timestamp + ".yml";
|
||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
||||
|
||||
if (logfile == NULL) {
|
||||
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(logfile, "binary: infill\n");
|
||||
char model_desc[128];
|
||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
||||
|
||||
fprintf(logfile, "\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "# Generation Results #\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "\n");
|
||||
|
||||
dump_string_yaml_multiline(logfile, "output", output.c_str());
|
||||
dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
|
||||
|
||||
llama_dump_timing_info_yaml(logfile, ctx);
|
||||
fclose(logfile);
|
||||
}
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
static void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (!is_interacting) {
|
||||
is_interacting = true;
|
||||
} else {
|
||||
console::cleanup();
|
||||
printf("\n");
|
||||
llama_print_timings(*g_ctx);
|
||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
||||
_exit(130);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
llama_sampling_params & sparams = params.sampling_params;
|
||||
g_params = ¶ms;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("infill", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
console::init(params.simple_io, params.use_color);
|
||||
atexit([]() { console::cleanup(); });
|
||||
|
||||
if (params.logits_all) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.embedding) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
||||
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
}
|
||||
if (params.instruct) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'main' tool for instruct mode\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
if (!params.antiprompt.empty()) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
if (params.random_prompt) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
if (!params.path_prompt_cache.empty()) {
|
||||
printf("\n************\n");
|
||||
printf("%s: infill does not support prompt caching\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 0.0) {
|
||||
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
|
||||
}
|
||||
|
||||
if (params.rope_freq_scale != 0.0) {
|
||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
|
||||
LOG("%s: llama backend init\n", __func__);
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
llama_context * ctx_guidance = NULL;
|
||||
g_model = &model;
|
||||
g_ctx = &ctx;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (sparams.cfg_scale > 1.f) {
|
||||
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
||||
ctx_guidance = llama_new_context_with_model(model, lparams);
|
||||
}
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_TEE("%s: error: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
LOG("n_ctx: %d\n", n_ctx);
|
||||
|
||||
if (n_ctx > n_ctx_train) {
|
||||
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s\n", get_system_info(params).c_str());
|
||||
}
|
||||
const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
|
||||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
bool suff_rm_leading_spc = params.escape;
|
||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
|
||||
params.input_suffix.erase(0, 1);
|
||||
suff_rm_leading_spc = false;
|
||||
}
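// note: with escape processing enabled, either the literal leading space of the suffix is dropped
// here, or (if there was none) the leading-space token that SPM tokenization inserts (space_token,
// id 29871, below) is stripped after tokenizing, so the suffix is not fed in with a doubled
// leading space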
|
||||
std::vector<llama_token> embd_inp;
|
||||
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
|
||||
const int space_token = 29871;
|
||||
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
|
||||
inp_sfx.erase(inp_sfx.begin());
|
||||
}
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
|
||||
if (add_bos) {
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
|
||||
}
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
|
||||
embd_inp = inp_pfx;
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
embd_inp.push_back(llama_token_middle(ctx));
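// resulting infill prompt layout: [BOS (SPM only)] <PRE> prefix tokens <SUF> suffix tokens <MID>;
// the model is then asked to generate the "middle" that connects the given prefix and suffix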
|
||||
|
||||
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
|
||||
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
|
||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||
}
|
||||
|
||||
// Tokenize negative prompt
|
||||
std::vector<llama_token> guidance_inp;
|
||||
int guidance_offset = 0;
|
||||
int original_prompt_len = 0;
|
||||
if (ctx_guidance) {
|
||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
|
||||
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
|
||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
|
||||
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
|
||||
|
||||
original_prompt_len = original_inp.size();
|
||||
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
||||
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
|
||||
LOG("guidance_offset: %s", log_tostr(guidance_offset));
|
||||
}
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// number of tokens to keep when resetting context
|
||||
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
|
||||
params.n_keep = (int)embd_inp.size();
|
||||
}
|
||||
|
||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
|
||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
|
||||
|
||||
|
||||
// enable interactive mode if interactive start is specified
|
||||
if (params.interactive_first) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (ctx_guidance) {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
|
||||
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
LOG_TEE("'\n");
|
||||
}
|
||||
LOG_TEE("\n");
|
||||
}
|
||||
|
||||
if (params.interactive) {
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = sigint_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
|
||||
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
|
||||
};
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
LOG_TEE("%s: interactive mode on.\n", __func__);
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG_TEE("Input prefix with BOS\n");
|
||||
}
|
||||
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
}
|
||||
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
}
|
||||
}
|
||||
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||
sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
struct llama_grammar * grammar = NULL;
|
||||
grammar_parser::parse_state parsed_grammar;
|
||||
|
||||
if (!params.grammar.empty()) {
|
||||
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
||||
// will be empty (default) if there are parse errors
|
||||
if (parsed_grammar.rules.empty()) {
|
||||
return 1;
|
||||
}
|
||||
LOG_TEE("%s: grammar:\n", __func__);
|
||||
grammar_parser::print_grammar(stderr, parsed_grammar);
|
||||
LOG_TEE("\n");
|
||||
|
||||
{
|
||||
auto it = sparams.logit_bias.find(llama_token_eos(ctx));
|
||||
if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
|
||||
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(
|
||||
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
|
||||
LOG_TEE("\n##### Infill mode #####\n\n");
|
||||
if (params.infill) {
|
||||
printf("\n************\n");
|
||||
printf("no need to specify '--infill', always running infill\n");
|
||||
printf("************\n\n");
|
||||
}
|
||||
if (params.interactive) {
|
||||
const char *control_message;
|
||||
if (params.multiline_input) {
|
||||
control_message = " - To return control to LLaMa, end your input with '\\'.\n"
|
||||
" - To return control without starting a new line, end your input with '/'.\n";
|
||||
} else {
|
||||
control_message = " - Press Return to return control to LLaMa.\n"
|
||||
" - To return control without starting a new line, end your input with '/'.\n"
|
||||
" - If you want to submit another line, end your input with '\\'.\n";
|
||||
}
|
||||
LOG_TEE("== Running in interactive mode. ==\n");
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
|
||||
#endif
|
||||
LOG_TEE( "%s\n", control_message);
|
||||
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
|
||||
bool input_echo = true;
|
||||
|
||||
int n_past = 0;
|
||||
int n_remain = params.n_predict;
|
||||
int n_consumed = 0;
|
||||
int n_past_guidance = 0;
|
||||
|
||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||
std::ostringstream output_ss; g_output_ss = &output_ss;
|
||||
|
||||
// the first thing we will do is to output the prompt, so set color accordingly
|
||||
console::set_display(console::prompt);
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> embd_guidance;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
// predict
|
||||
if (!embd.empty()) {
|
||||
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
|
||||
// --prompt or --file which uses the same value.
|
||||
int max_embd_size = n_ctx - 4;
|
||||
|
||||
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
|
||||
if ((int) embd.size() > max_embd_size) {
|
||||
const int skipped_tokens = (int) embd.size() - max_embd_size;
|
||||
embd.resize(max_embd_size);
|
||||
|
||||
console::set_display(console::error);
|
||||
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
console::set_display(console::reset);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
// infinite text generation via context swapping
|
||||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
|
||||
if (params.n_predict == -2) {
|
||||
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
|
||||
break;
|
||||
}
|
||||
|
||||
const int n_left = n_past - params.n_keep - 1;
|
||||
const int n_discard = n_left/2;
|
||||
|
||||
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
|
||||
n_past, n_left, n_ctx, params.n_keep, n_discard);
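// worked example (illustrative values): with n_ctx = 2048, n_keep = 16 and n_past = 2048,
// n_left = 2048 - 16 - 1 = 2031 and n_discard = 1015; cache positions
// [n_keep + 1, n_keep + 1 + n_discard) are removed and the remaining entries are shifted
// down by n_discard before generation continues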
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
|
||||
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
if (ctx_guidance) {
|
||||
n_past_guidance -= n_discard;
|
||||
}
|
||||
|
||||
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
|
||||
|
||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||
|
||||
}
|
||||
|
||||
// evaluate tokens in batches
|
||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
||||
|
||||
if (ctx_guidance) {
|
||||
int input_size = 0;
|
||||
llama_token * input_buf = NULL;
|
||||
|
||||
if (n_past_guidance < (int) guidance_inp.size()) {
|
||||
// Guidance context should have the same data with these modifications:
|
||||
//
|
||||
// * Replace the initial prompt
|
||||
// * Shift everything by guidance_offset
|
||||
embd_guidance = guidance_inp;
|
||||
if (embd.begin() + original_prompt_len < embd.end()) {
|
||||
embd_guidance.insert(
|
||||
embd_guidance.end(),
|
||||
embd.begin() + original_prompt_len,
|
||||
embd.end()
|
||||
);
|
||||
}
|
||||
|
||||
input_buf = embd_guidance.data();
|
||||
input_size = embd_guidance.size();
|
||||
|
||||
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
|
||||
} else {
|
||||
input_buf = embd.data();
|
||||
input_size = embd.size();
|
||||
}
|
||||
|
||||
for (int i = 0; i < input_size; i += params.n_batch) {
|
||||
int n_eval = std::min(input_size - i, params.n_batch);
|
||||
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_past_guidance += n_eval;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
||||
int n_eval = (int) embd.size() - i;
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
|
||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_past += n_eval;
|
||||
|
||||
LOG("n_past = %d\n", n_past);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
embd_guidance.clear();
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id);
|
||||
|
||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||
|
||||
embd.push_back(id);
|
||||
|
||||
// echo this to console
|
||||
input_echo = true;
|
||||
|
||||
// decrement remaining sampling budget
|
||||
--n_remain;
|
||||
|
||||
LOG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
|
||||
ctx_sampling->prev.push_back(embd_inp[n_consumed]);
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// display text
|
||||
if (input_echo) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
printf("%s", token_str.c_str());
|
||||
|
||||
if (embd.size() > 1) {
|
||||
input_tokens.push_back(id);
|
||||
} else {
|
||||
output_tokens.push_back(id);
|
||||
output_ss << token_str;
|
||||
}
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
// reset color to default if there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
}
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
|
||||
// deal with eot token in infill mode
|
||||
if ((ctx_sampling->prev.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){
|
||||
if(is_interacting && !params.interactive_first) {
|
||||
// print an eot token
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n");
|
||||
console::set_display(console::user_input);
|
||||
std::string buffer;
|
||||
std::string line;
|
||||
bool another_line=true;
|
||||
// set a new prefix via stdin
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line, if so we use the old input
|
||||
if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_prefix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
// set a new suffix via stdin
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
// check if we got an empty line
|
||||
if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
|
||||
params.input_suffix = buffer;
|
||||
}
|
||||
buffer.clear();
|
||||
// done taking input, reset color
|
||||
console::set_display(console::reset);
|
||||
|
||||
if (params.escape) {
|
||||
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
|
||||
process_escapes(params.input_prefix);
|
||||
process_escapes(params.input_suffix);
|
||||
}
|
||||
suff_rm_leading_spc = params.escape;
|
||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
|
||||
params.input_suffix.erase(0, 1);
|
||||
suff_rm_leading_spc = false;
|
||||
}
|
||||
// tokenize new prefix and suffix
|
||||
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
|
||||
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
|
||||
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
|
||||
inp_sfx.erase(inp_sfx.begin());
|
||||
}
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
|
||||
if (add_bos) {
|
||||
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
|
||||
}
|
||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
|
||||
embd_inp = inp_pfx;
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
embd_inp.push_back(llama_token_middle(ctx));
|
||||
embd.clear();
|
||||
embd_guidance.clear();
|
||||
n_remain = params.n_predict;
|
||||
n_past = 0;
|
||||
n_consumed = 0;
|
||||
// LOG_TEE("took new input\n");
|
||||
is_interacting = false;
|
||||
}
|
||||
// deal with end of text token in interactive mode
|
||||
else if (ctx_sampling->prev.back() == llama_token_eos(ctx)) {
|
||||
LOG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
|
||||
is_interacting = true;
|
||||
printf("\n");
|
||||
console::set_display(console::user_input);
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting && !params.interactive) {
|
||||
LOG("waiting for user input\n");
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
buffer += params.input_prefix;
|
||||
printf("%s", buffer.c_str());
|
||||
}
|
||||
|
||||
std::string line;
|
||||
bool another_line = true;
|
||||
do {
|
||||
another_line = console::readline(line, params.multiline_input);
|
||||
buffer += line;
|
||||
} while (another_line);
|
||||
|
||||
// done taking input, reset color
|
||||
console::set_display(console::reset);
|
||||
|
||||
// Add tokens to embd only if the input buffer is non-empty
|
||||
// Entering an empty line lets the user pass control back
|
||||
if (buffer.length() > 1) {
|
||||
// append input suffix if any
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
buffer += params.input_suffix;
|
||||
printf("%s", params.input_suffix.c_str());
|
||||
}
|
||||
|
||||
LOG("buffer: '%s'\n", buffer.c_str());
|
||||
|
||||
const size_t original_size = embd_inp.size();
|
||||
|
||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
||||
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
|
||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||
const llama_token token = embd_inp[i];
|
||||
output_tokens.push_back(token);
|
||||
output_ss << llama_token_to_piece(ctx, token);
|
||||
}
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
LOG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
LOG("empty line, passing control back\n");
|
||||
}
|
||||
|
||||
input_echo = false; // do not echo this again
|
||||
}
|
||||
|
||||
if (n_past > 0) {
|
||||
if (is_interacting) {
|
||||
// reset grammar state if we're restarting generation
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_free(grammar);
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(
|
||||
grammar_rules.data(), grammar_rules.size(),
|
||||
parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
}
|
||||
is_interacting = false;
|
||||
}
|
||||
}
|
||||
|
||||
// end of text token
|
||||
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) {
|
||||
break;
|
||||
}
|
||||
|
||||
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
||||
// We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
|
||||
if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
|
||||
n_remain = params.n_predict;
|
||||
is_interacting = true;
|
||||
}
|
||||
}
|
||||
if (!params.interactive && n_remain <= 0) {
|
||||
printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||
|
||||
if (ctx_guidance) { llama_free(ctx_guidance); }
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_free(grammar);
|
||||
}
|
||||
llama_backend_free();
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
LOG_TEE("Log end\n");
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@

This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.

The jeopardy test can be used to compare the fact knowledge of different models and compare them to each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
The jeopardy test can be used to compare the fact knowledge of different models and compare them to eachother. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.


Step 1: Open jeopardy.sh and modify the following:
@@ -132,6 +132,7 @@ struct cmd_params {
|
||||
std::vector<int> n_gpu_layers;
|
||||
std::vector<int> main_gpu;
|
||||
std::vector<bool> mul_mat_q;
|
||||
std::vector<bool> low_vram;
|
||||
std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
|
||||
int reps;
|
||||
bool verbose;
|
||||
@@ -148,6 +149,7 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* n_gpu_layers */ {99},
|
||||
/* main_gpu */ {0},
|
||||
/* mul_mat_q */ {true},
|
||||
/* low_vram */ {false},
|
||||
/* tensor_split */ {{}},
|
||||
/* reps */ 5,
|
||||
/* verbose */ false,
|
||||
@@ -165,8 +167,9 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
|
||||
printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
|
||||
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||
printf(" -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||
printf(" -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||
printf(" -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
|
||||
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
|
||||
printf(" -ts, --tensor_split <ts0/ts1/..> \n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
@@ -252,6 +255,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
params.main_gpu = split<int>(argv[i], split_delim);
|
||||
} else if (arg == "-lv" || arg == "--low-vram") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<bool>(argv[i], split_delim);
|
||||
params.low_vram.insert(params.low_vram.end(), p.begin(), p.end());
|
||||
} else if (arg == "-mmq" || arg == "--mul-mat-q") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -326,6 +336,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
|
||||
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
|
||||
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
|
||||
if (params.low_vram.empty()) { params.low_vram = cmd_params_defaults.low_vram; }
|
||||
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
|
||||
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
||||
|
||||
@@ -342,34 +353,21 @@ struct cmd_params_instance {
|
||||
int n_gpu_layers;
|
||||
int main_gpu;
|
||||
bool mul_mat_q;
|
||||
bool low_vram;
|
||||
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
|
||||
|
||||
llama_model_params to_llama_mparams() const {
|
||||
llama_model_params mparams = llama_model_default_params();
|
||||
llama_context_params to_llama_params() const {
|
||||
llama_context_params lparams = llama_context_default_params();
|
||||
lparams.n_ctx = n_prompt + n_gen;
|
||||
lparams.n_batch = n_batch;
|
||||
lparams.f16_kv = !f32_kv;
|
||||
lparams.n_gpu_layers = n_gpu_layers;
|
||||
lparams.main_gpu = main_gpu;
|
||||
lparams.mul_mat_q = mul_mat_q;
|
||||
lparams.low_vram = low_vram;
|
||||
lparams.tensor_split = tensor_split.data();
|
||||
|
||||
mparams.n_gpu_layers = n_gpu_layers;
|
||||
mparams.main_gpu = main_gpu;
|
||||
mparams.tensor_split = tensor_split.data();
|
||||
|
||||
return mparams;
|
||||
}
|
||||
|
||||
bool equal_mparams(const cmd_params_instance & other) const {
|
||||
return model == other.model &&
|
||||
n_gpu_layers == other.n_gpu_layers &&
|
||||
main_gpu == other.main_gpu &&
|
||||
tensor_split == other.tensor_split;
|
||||
}
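// equal_mparams compares only the model-level parameters, so the benchmark loop can keep an
// already loaded llama_model and only rebuild the llama_context when context-level settings change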
|
||||
|
||||
llama_context_params to_llama_cparams() const {
|
||||
llama_context_params cparams = llama_context_default_params();
|
||||
|
||||
cparams.n_ctx = n_prompt + n_gen;
|
||||
cparams.n_batch = n_batch;
|
||||
cparams.f16_kv = !f32_kv;
|
||||
cparams.mul_mat_q = mul_mat_q;
|
||||
|
||||
return cparams;
|
||||
return lparams;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -377,12 +375,13 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
|
||||
std::vector<cmd_params_instance> instances;
|
||||
|
||||
for (const auto & m : params.model)
|
||||
for (const auto & nl : params.n_gpu_layers)
|
||||
for (const auto & mg : params.main_gpu)
|
||||
for (const auto & ts : params.tensor_split)
|
||||
for (const auto & nb : params.n_batch)
|
||||
for (const auto & fk : params.f32_kv)
|
||||
for (const auto & nl : params.n_gpu_layers)
|
||||
for (const auto & mg : params.main_gpu)
|
||||
for (const auto & mmq : params.mul_mat_q)
|
||||
for (const auto & lv : params.low_vram)
|
||||
for (const auto & ts : params.tensor_split)
|
||||
for (const auto & nt : params.n_threads) {
|
||||
cmd_params_instance instance = {
|
||||
/* .model = */ m,
|
||||
@@ -394,6 +393,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .main_gpu = */ mg,
|
||||
/* .mul_mat_q = */ mmq,
|
||||
/* .low_vram = */ lv,
|
||||
/* .tensor_split = */ ts,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
@@ -404,56 +404,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
|
||||
static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
|
||||
std::vector<cmd_params_instance> instances;
|
||||
|
||||
#if 1
|
||||
// this ordering minimizes the number of times that each model needs to be reloaded
|
||||
for (const auto & m : params.model)
|
||||
for (const auto & nl : params.n_gpu_layers)
|
||||
for (const auto & mg : params.main_gpu)
|
||||
for (const auto & ts : params.tensor_split)
|
||||
for (const auto & nb : params.n_batch)
|
||||
for (const auto & fk : params.f32_kv)
|
||||
for (const auto & mmq : params.mul_mat_q)
|
||||
for (const auto & nt : params.n_threads) {
|
||||
for (const auto & n_prompt : params.n_prompt) {
|
||||
if (n_prompt == 0) {
|
||||
continue;
|
||||
}
|
||||
cmd_params_instance instance = {
|
||||
/* .model = */ m,
|
||||
/* .n_prompt = */ n_prompt,
|
||||
/* .n_gen = */ 0,
|
||||
/* .n_batch = */ nb,
|
||||
/* .f32_kv = */ fk,
|
||||
/* .n_threads = */ nt,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .main_gpu = */ mg,
|
||||
/* .mul_mat_q = */ mmq,
|
||||
/* .tensor_split = */ ts,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
|
||||
for (const auto & n_gen : params.n_gen) {
|
||||
if (n_gen == 0) {
|
||||
continue;
|
||||
}
|
||||
cmd_params_instance instance = {
|
||||
/* .model = */ m,
|
||||
/* .n_prompt = */ 0,
|
||||
/* .n_gen = */ n_gen,
|
||||
/* .n_batch = */ nb,
|
||||
/* .f32_kv = */ fk,
|
||||
/* .n_threads = */ nt,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .main_gpu = */ mg,
|
||||
/* .mul_mat_q = */ mmq,
|
||||
/* .tensor_split = */ ts,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
}
|
||||
#else
|
||||
// this ordering separates the prompt and generation tests
|
||||
for (const auto & n_prompt : params.n_prompt) {
|
||||
if (n_prompt == 0) {
|
||||
continue;
|
||||
@@ -469,7 +419,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
|
||||
instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
|
||||
}
|
||||
#endif
|
||||
|
||||
return instances;
|
||||
}
|
||||
@@ -494,6 +443,7 @@ struct test {
|
||||
int n_gpu_layers;
|
||||
int main_gpu;
|
||||
bool mul_mat_q;
|
||||
bool low_vram;
|
||||
std::array<float, LLAMA_MAX_DEVICES> tensor_split;
|
||||
int n_prompt;
|
||||
int n_gen;
|
||||
@@ -513,6 +463,7 @@ struct test {
|
||||
n_gpu_layers = inst.n_gpu_layers;
|
||||
main_gpu = inst.main_gpu;
|
||||
mul_mat_q = inst.mul_mat_q;
|
||||
low_vram = inst.low_vram;
|
||||
tensor_split = inst.tensor_split;
|
||||
n_prompt = inst.n_prompt;
|
||||
n_gen = inst.n_gen;
|
||||
@@ -573,7 +524,7 @@ struct test {
|
||||
"cpu_info", "gpu_info",
|
||||
"model_filename", "model_type", "model_size", "model_n_params",
|
||||
"n_batch", "n_threads", "f16_kv",
|
||||
"n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
|
||||
"n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
|
||||
"n_prompt", "n_gen", "test_time",
|
||||
"avg_ns", "stddev_ns",
|
||||
"avg_ts", "stddev_ts"
|
||||
@@ -592,7 +543,7 @@ struct test {
|
||||
return INT;
|
||||
}
|
||||
if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
|
||||
field == "f16_kv" || field == "mul_mat_q") {
|
||||
field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
|
||||
return BOOL;
|
||||
}
|
||||
if (field == "avg_ts" || field == "stddev_ts") {
|
||||
@@ -623,7 +574,7 @@ struct test {
|
||||
cpu_info, gpu_info,
|
||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
|
||||
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
|
||||
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
|
||||
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
||||
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
||||
std::to_string(avg_ts()), std::to_string(stdev_ts())
|
||||
@@ -655,9 +606,9 @@ struct printer {
|
||||
virtual ~printer() {}
|
||||
|
||||
FILE * fout;
|
||||
virtual void print_header(const cmd_params & params) { (void) params; }
|
||||
virtual void print_header(const cmd_params & params) { (void) params; };
|
||||
virtual void print_test(const test & t) = 0;
|
||||
virtual void print_footer() { }
|
||||
virtual void print_footer() { };
|
||||
};
|
||||
|
||||
struct csv_printer : public printer {
|
||||
@@ -815,6 +766,9 @@ struct markdown_printer : public printer {
|
||||
if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
|
||||
fields.push_back("mul_mat_q");
|
||||
}
|
||||
if (params.low_vram.size() > 1 || params.low_vram != cmd_params_defaults.low_vram) {
|
||||
fields.push_back("low_vram");
|
||||
}
|
||||
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
||||
fields.push_back("tensor_split");
|
||||
}
|
||||
@@ -935,23 +889,17 @@ struct sql_printer : public printer {
|
||||
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
|
||||
std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
|
||||
int n_processed = 0;
|
||||
|
||||
llama_set_n_threads(ctx, n_threads, n_threads);
|
||||
|
||||
while (n_processed < n_prompt) {
|
||||
int n_tokens = std::min(n_prompt - n_processed, n_batch);
|
||||
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
|
||||
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0), n_threads);
|
||||
n_processed += n_tokens;
|
||||
}
|
||||
}
|
||||
|
||||
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
|
||||
llama_token token = llama_token_bos(ctx);
|
||||
|
||||
llama_set_n_threads(ctx, n_threads, n_threads);
|
||||
|
||||
for (int i = 0; i < n_gen; i++) {
|
||||
llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
|
||||
llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0), n_threads);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1010,25 +958,17 @@ int main(int argc, char ** argv) {
|
||||
|
||||
std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
|
||||
|
||||
llama_model * lmodel = nullptr;
|
||||
const cmd_params_instance * prev_inst = nullptr;
|
||||
|
||||
for (const auto & inst : params_instances) {
|
||||
// keep the same model between tests when possible
|
||||
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
|
||||
if (lmodel) {
|
||||
llama_free_model(lmodel);
|
||||
}
|
||||
// TODO: keep the model between tests when possible
|
||||
llama_context_params lparams = inst.to_llama_params();
|
||||
|
||||
lmodel = llama_load_model_from_file(inst.model.c_str(), inst.to_llama_mparams());
|
||||
if (lmodel == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
|
||||
return 1;
|
||||
}
|
||||
prev_inst = &inst;
|
||||
llama_model * lmodel = llama_load_model_from_file(inst.model.c_str(), lparams);
|
||||
if (lmodel == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(lmodel, inst.to_llama_cparams());
|
||||
llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
|
||||
llama_free_model(lmodel);
|
||||
@@ -1066,10 +1006,9 @@ int main(int argc, char ** argv) {
|
||||
llama_print_timings(ctx);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(lmodel);
|
||||
}
|
||||
|
||||
llama_free_model(lmodel);
|
||||
|
||||
p->print_footer();
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
@@ -1,20 +0,0 @@
set(TARGET clip)
add_library(${TARGET} clip.cpp clip.h)
install(TARGETS ${TARGET} LIBRARY)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if (NOT MSVC)
  target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
endif()
if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
endif()

set(TARGET llava)
add_executable(${TARGET} llava.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
endif()
@@ -1,57 +0,0 @@
# LLaVA

Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants.

The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
models are available.

After the API is confirmed, more models will be supported / uploaded.

## Usage
Build with cmake or run `make llava` to build it.

After building, run `./llava` to see the usage. For example:

```sh
./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
```

**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.

## Model conversion

1. Clone `llava-v1.5-7b` and `clip-vit-large-patch14-336` locally:

```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b

git clone https://huggingface.co/openai/clip-vit-large-patch14-336
```

2. Use `llava-surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:

```sh
python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
```

3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF:

```sh
python ./examples/llava/convert-image-encoder-to-gguf -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
```

4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:

```sh
python ./convert.py ../llava-v1.5-7b
```

Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.

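As a quick sanity check after step 4, one can verify that the expected GGUF files exist. The sketch below assumes the directory layout from the steps above and the `mmproj-model-f16.gguf` / `ggml-model-*.gguf` filenames mentioned in this README; treat the exact names as assumptions:

```python
# Minimal sanity-check sketch; filenames are assumptions based on the steps and usage example above.
from pathlib import Path

model_dir = Path("../llava-v1.5-7b")                       # directory used in the conversion steps
mmproj = model_dir / "mmproj-model-f16.gguf"               # image encoder / projector from step 3
llama_parts = sorted(model_dir.glob("ggml-model-*.gguf"))  # LLaMA part written by convert.py (step 4)

print("mmproj present:", mmproj.exists())
print("LLaMA part(s):", [p.name for p in llama_parts] or "not found")
```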
## TODO

- [ ] Support server mode.
- [ ] Support non-CPU backend for the image encoding part.
- [ ] Support different sampling methods.
- [ ] Support more model variants.
File diff suppressed because it is too large
@@ -1,73 +0,0 @@
#ifndef CLIP_H
#define CLIP_H

#include "ggml.h"

struct clip_ctx;

#ifdef __cplusplus
extern "C" {
#endif

struct clip_vision_hparams {
    int32_t image_size;
    int32_t patch_size;
    int32_t hidden_size;
    int32_t n_intermediate;
    int32_t projection_dim;
    int32_t n_head;
    int32_t n_layer;
    float eps;
};

struct clip_ctx * clip_model_load(const char * fname, const int verbosity);

void clip_free(struct clip_ctx * ctx);

size_t clip_embd_nbytes(struct clip_ctx * ctx);
int clip_n_patches(struct clip_ctx * ctx);
int clip_n_mmproj_embd(struct clip_ctx * ctx);

// RGB uint8 image
struct clip_image_u8 {
    int nx;
    int ny;
    uint8_t * data;
    size_t size;
};

// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
    int nx;
    int ny;
    float * data;
    size_t size;
};

struct clip_image_u8_batch {
    struct clip_image_u8 * data;
    size_t size;
};

struct clip_image_f32_batch {
    struct clip_image_f32 * data;
    size_t size;
};

struct clip_image_u8 * make_clip_image_u8();
struct clip_image_f32 * make_clip_image_f32();
bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);

bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
                             float * vec);

bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);
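// Typical call sequence suggested by the declarations above (a descriptive sketch, not code taken
// from the repository):
//
//   struct clip_ctx * ctx = clip_model_load(fname, /*verbosity=*/1);
//   struct clip_image_u8  * img = make_clip_image_u8();
//   struct clip_image_f32 * res = make_clip_image_f32();
//   clip_image_load_from_file(path, img);
//   clip_image_preprocess(ctx, img, res, /*pad2square=*/true);
//   float * vec = malloc(clip_embd_nbytes(ctx));
//   clip_image_encode(ctx, n_threads, res, vec);
//   clip_free(ctx);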

#ifdef __cplusplus
}
#endif

#endif // CLIP_H
@@ -1,250 +0,0 @@
|
||||
import argparse
|
||||
import os
|
||||
import json
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
from gguf import *
|
||||
from transformers import CLIPModel, CLIPProcessor
|
||||
|
||||
TEXT = "clip.text"
|
||||
VISION = "clip.vision"
|
||||
|
||||
|
||||
def k(raw_key: str, arch: str) -> str:
|
||||
return raw_key.format(arch=arch)
|
||||
|
||||
|
||||
def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: bool) -> bool:
|
||||
if name in (
|
||||
"logit_scale",
|
||||
"text_model.embeddings.position_ids",
|
||||
"vision_model.embeddings.position_ids",
|
||||
):
|
||||
return True
|
||||
|
||||
if has_llava and name in ["visual_projection.weight", "vision_model.post_layernorm.weight", "vision_model.post_layernorm.bias"]:
|
||||
return True
|
||||
|
||||
if name.startswith("v") and not has_vision:
|
||||
return True
|
||||
|
||||
if name.startswith("t") and not has_text:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def get_tensor_name(name: str) -> str:
|
||||
if "projection" in name:
|
||||
return name
|
||||
|
||||
if "mm_projector" in name:
|
||||
return name.replace("model.mm_projector", "mm")
|
||||
|
||||
return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a signficant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = (
|
||||
list(range(ord("!"), ord("~") + 1))
|
||||
+ list(range(ord("¡"), ord("¬") + 1))
|
||||
+ list(range(ord("®"), ord("ÿ") + 1))
|
||||
)
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8 + n)
|
||||
n += 1
|
||||
cs = [chr(n) for n in cs]
|
||||
return dict(zip(bs, cs))
|
||||
|
||||
|
||||
ap = argparse.ArgumentParser(prog="convert_hf_to_gguf.py")
|
||||
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
|
||||
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
|
||||
ap.add_argument("--text-only", action="store_true", required=False,
|
||||
help="Save a text-only model. It can't be used to encode images")
|
||||
ap.add_argument("--vision-only", action="store_true", required=False,
|
||||
help="Save a vision-only model. It can't be used to encode texts")
|
||||
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
|
||||
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
|
||||
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
|
||||
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
|
||||
|
||||
args = ap.parse_args()
|
||||
|
||||
|
||||
if args.text_only and args.vision_only:
|
||||
print("--text-only and --image-only arguments cannot be specified at the same time.")
|
||||
exit(1)
|
||||
|
||||
if args.use_f32:
|
||||
print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
|
||||
|
||||
# output in the same directory as the model if output_dir is None
|
||||
dir_model = args.model_dir
|
||||
|
||||
|
||||
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
|
||||
vocab = json.load(f)
|
||||
tokens = [key for key in vocab]
|
||||
|
||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||
config = json.load(f)
|
||||
v_hparams = config["vision_config"]
|
||||
t_hparams = config["text_config"]
|
||||
|
||||
# possible data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
#
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
ftype = 1
|
||||
if args.use_f32:
|
||||
ftype = 0
|
||||
|
||||
|
||||
model = CLIPModel.from_pretrained(dir_model)
|
||||
processor = CLIPProcessor.from_pretrained(dir_model)
|
||||
|
||||
fname_middle = None
|
||||
has_text_encoder = True
|
||||
has_vision_encoder = True
|
||||
has_llava_projector = False
|
||||
if args.text_only:
|
||||
fname_middle = "text-"
|
||||
has_vision_encoder = False
|
||||
elif args.vision_only:
|
||||
fname_middle = "vision-"
|
||||
has_text_encoder = False
|
||||
elif args.llava_projector is not None:
|
||||
fname_middle = "mmproj-"
|
||||
has_text_encoder = False
|
||||
has_llava_projector = True
|
||||
else:
|
||||
fname_middle = ""
|
||||
|
||||
output_dir = args.output_dir if args.output_dir is not None else dir_model
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
|
||||
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
|
||||
fout = GGUFWriter(path=fname_out, arch="clip")
|
||||
|
||||
fout.add_bool("clip.has_text_encoder", has_text_encoder)
|
||||
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
|
||||
fout.add_bool("clip.has_llava_projector", has_llava_projector)
|
||||
fout.add_file_type(ftype)
|
||||
model_name = config["_name_or_path"] if "_name_or_path" in config else os.path.basename(dir_model)
|
||||
fout.add_name(model_name)
|
||||
if args.text_only:
|
||||
fout.add_description("text-only CLIP model")
|
||||
elif args.vision_only and not has_llava_projector:
|
||||
fout.add_description("vision-only CLIP model")
|
||||
elif has_llava_projector:
|
||||
fout.add_description("image encoder for LLaVA")
|
||||
else:
|
||||
fout.add_description("two-tower CLIP model")
|
||||
|
||||
if has_text_encoder:
|
||||
# text_model hparams
|
||||
fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"])
|
||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"])
|
||||
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"])
|
||||
fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"]))
|
||||
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"])
|
||||
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"])
|
||||
fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"])
|
||||
fout.add_token_list(tokens)
|
||||
|
||||
if has_vision_encoder:
|
||||
# vision_model hparams
|
||||
fout.add_uint32("clip.vision.image_size", v_hparams["image_size"])
|
||||
fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"])
|
||||
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"])
|
||||
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"])
|
||||
fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"]))
|
||||
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"])
|
||||
fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
|
||||
block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
|
||||
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
|
||||
|
||||
image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
|
||||
image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
|
||||
fout.add_array("clip.vision.image_mean", image_mean)
|
||||
fout.add_array("clip.vision.image_std", image_std)
|
||||
|
||||
use_gelu = v_hparams["hidden_act"] == "gelu"
|
||||
fout.add_bool("clip.use_gelu", use_gelu)
|
||||
|
||||
|
||||
if has_llava_projector:
|
||||
model.vision_model.encoder.layers.pop(-1)
|
||||
projector = torch.load(args.llava_projector)
|
||||
for name, data in projector.items():
|
||||
name = get_tensor_name(name)
|
||||
if data.ndim == 2:
|
||||
data = data.squeeze().numpy().astype(np.float16)
|
||||
else:
|
||||
data = data.squeeze().numpy().astype(np.float32)
|
||||
|
||||
fout.add_tensor(name, data)
|
||||
|
||||
print("Projector tensors added\n")
|
||||
|
||||
state_dict = model.state_dict()
for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_llava_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name[-7:] == ".weight" and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)


fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()

print("Done. Output file: " + fname_out)
@@ -1,30 +0,0 @@
import argparse
import glob
import os
import torch


ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", help="Path to LLaVA v1.5 model")
args = ap.parse_args()

# find the model part that includes the multimodal projector weights
path = sorted(glob.glob(f"{args.model}/pytorch_model*.bin"))[-1]
checkpoint = torch.load(path)

# get a list of mm tensor names
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")]

# store these tensors in a new dictionary and torch.save them
projector = {name: checkpoint[name] for name in mm_tensors}
torch.save(projector, f"{args.model}/llava.projector")

# remove these tensors from the checkpoint and save it again
for name in mm_tensors:
    del checkpoint[name]

torch.save(checkpoint, path)

print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
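For orientation, these two scripts are normally run back to back: the surgery step above splits the multimodal projector out of the LLaVA checkpoint, and the encoder-conversion script earlier in this diff then packs the CLIP/vision weights together with that projector into a GGUF file. A minimal sketch of the workflow (script names and paths are illustrative assumptions, not taken verbatim from this diff):

```sh
# 1) split the mm projector out of a LLaVA v1.5 checkpoint (hypothetical script name/path)
python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b

# 2) pack the CLIP vision tower plus the extracted projector into a GGUF file (hypothetical script name/path)
python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 \
    --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
```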
@@ -1,145 +0,0 @@
#pragma once

// this one and clip lib will be eventually merged to a single lib, let's keep it this way for now

#include "common.h"
#include "llama.h"

#include <cstdio>
#include <cstdlib>
#include <vector>

inline bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch, int * n_past) {
    int n_embd = llama_n_embd(llama_get_model(ctx_llama));

    for (int i = 0; i < N; i += n_batch) {
        int n_eval = N - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
        if (llama_decode(ctx_llama, batch)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
}

inline bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
    int N = (int) tokens.size();
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
}

inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
    std::vector<llama_token> tokens;
    tokens.push_back(id);
    return eval_tokens(ctx_llama, tokens, 1, n_past);
}

inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
    std::string str2 = str;
    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
    return true;
}

// TODO: use common/sampling.h
inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
    // out of user input, sample next token
    const float temp = params.sampling_params.temp;
    const int32_t top_k = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k;
    const float top_p = params.sampling_params.top_p;
    const float tfs_z = params.sampling_params.tfs_z;
    const float typical_p = params.sampling_params.typical_p;
    // const int32_t repeat_last_n = params.sampling_params.repeat_last_n < 0 ? n_ctx : params.sampling_params.repeat_last_n;
    // const float repeat_penalty = params.sampling_params.repeat_penalty;
    // const float alpha_presence = params.sampling_params.presence_penalty;
    // const float alpha_frequency = params.sampling_params.frequency_penalty;
    const int mirostat = params.sampling_params.mirostat;
    const float mirostat_tau = params.sampling_params.mirostat_tau;
    const float mirostat_eta = params.sampling_params.mirostat_eta;
    // const bool penalize_nl = params.sampling_params.penalize_nl;

    llama_token id = 0;
    {
        auto logits = llama_get_logits(ctx_llama);
        auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));

        // Apply params.logit_bias map
        for (auto it = params.sampling_params.logit_bias.begin(); it != params.sampling_params.logit_bias.end(); it++) {
            logits[it->first] += it->second;
        }

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // TODO: Apply penalties
        // float nl_logit = logits[llama_token_nl(ctx)];
        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
        // llama_sample_repetition_penalty(ctx, &candidates_p,
        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
        //     last_n_repeat, repeat_penalty);
        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
        //     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
        //     last_n_repeat, alpha_frequency, alpha_presence);
        // if (!penalize_nl) {
        //     logits[llama_token_nl(ctx)] = nl_logit;
        // }

        if (temp <= 0) {
            // Greedy sampling
            id = llama_sample_token_greedy(ctx_llama, &candidates_p);
        } else {
            if (mirostat == 1) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                const int mirostat_m = 100;
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
            } else if (mirostat == 2) {
                static float mirostat_mu = 2.0f * mirostat_tau;
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
            } else {
                // Temperature sampling
                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
                llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
                llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
                llama_sample_temp(ctx_llama, &candidates_p, temp);
                id = llama_sample_token(ctx_llama, &candidates_p);
            }
        }
    }

    return id;
}

inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
    int id = sample_id(ctx_llama, params);
    static std::string ret;
    if (id == llama_token_eos(ctx_llama)) {
        ret = "</s>";
    } else {
        ret = llama_token_to_piece(ctx_llama, id);
    }
    eval_id(ctx_llama, id, n_past);
    return ret.c_str();
}
@@ -1,164 +0,0 @@
#include "clip.h"
#include "llava-utils.h"
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <cstdlib>
#include <vector>

static void show_additional_info(int /*argc*/, char ** argv) {
    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
    printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}

int main(int argc, char ** argv) {
    ggml_time_init();

    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        show_additional_info(argc, argv);
        return 1;
    }

    if (params.mmproj.empty() || params.image.empty()) {
        gpt_print_usage(argc, argv, params);
        show_additional_info(argc, argv);
        return 1;
    }

    const char * clip_path = params.mmproj.c_str();
    const char * img_path = params.image.c_str();

    if (params.prompt.empty()) {
        params.prompt = "describe the image in detail.";
    }

    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);

    // load and preprocess the image
    clip_image_u8 img;
    clip_image_f32 img_res;

    if (!clip_image_load_from_file(img_path, &img)) {
        fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path);

        clip_free(ctx_clip);
        return 1;
    }

    if (!clip_image_preprocess(ctx_clip, &img, &img_res, /*pad2square =*/ true)) {
        fprintf(stderr, "%s: unable to preprocess %s\n", __func__, img_path);

        clip_free(ctx_clip);
        return 1;
    }

    int n_img_pos = clip_n_patches(ctx_clip);
    int n_img_embd = clip_n_mmproj_embd(ctx_clip);

    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));

    if (!image_embd) {
        fprintf(stderr, "Unable to allocate memory for image embeddings\n");

        return 1;
    }

    const int64_t t_img_enc_start_us = ggml_time_us();
    if (!clip_image_encode(ctx_clip, params.n_threads, &img_res, image_embd)) {
        fprintf(stderr, "Unable to encode image\n");

        return 1;
    }
    const int64_t t_img_enc_end_us = ggml_time_us();

    // we get the embeddings, free up the memory required for CLIP
    clip_free(ctx_clip);

    llama_backend_init(params.numa);

    llama_model_params model_params = llama_model_default_params();
    model_params.n_gpu_layers = params.n_gpu_layers;
    model_params.main_gpu = params.main_gpu;
    model_params.tensor_split = params.tensor_split;
    model_params.use_mmap = params.use_mmap;
    model_params.use_mlock = params.use_mlock;

    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

    llama_context_params ctx_params = llama_context_default_params();

    ctx_params.n_ctx = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
    ctx_params.n_threads = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    ctx_params.seed = params.seed;

    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

    if (ctx_llama == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

    // make sure that the correct mmproj was used, i.e., compare apples to apples
    const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));

    if (n_img_embd != n_llama_embd) {
        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);

        llama_free(ctx_llama);
        llama_free_model(model);
        llama_backend_free();
        free(image_embd);

        return 1;
    }

    // process the prompt
    // llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"

    int n_past = 0;

    const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;

    eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true);
    eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
    eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false);

    // generate the response

    printf("\n");
    printf("prompt: '%s'\n", params.prompt.c_str());
    printf("\n");

    for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(ctx_llama, params, &n_past);
        if (strcmp(tmp, "</s>") == 0) break;

        printf("%s", tmp);
        fflush(stdout);
    }

    printf("\n");

    {
        const float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

        printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos);
    }

    llama_print_timings(ctx_llama);

    llama_free(ctx_llama);
    llama_free_model(model);
    llama_backend_free();
    free(image_embd);

    return 0;
}
@@ -28,16 +28,6 @@ configure_file(${_common_path}/../build-info.h
target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
${CMAKE_CURRENT_BINARY_DIR})

# If the common project was part of "main-cmake-pkg" the transient
# defines would automatically be attached. Because the common func-
# tionality is separate, but dependent upon the defines, it must be
# explicitly extracted from the "llama" target.
#
get_target_property(_llama_transient_defines llama
INTERFACE_COMPILE_DEFINITIONS)

target_compile_definitions(common PRIVATE "${_llama_transient_defines}")

add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
target_include_directories(${TARGET} PRIVATE ${_common_path})
install(TARGETS ${TARGET} RUNTIME)

@@ -262,8 +262,7 @@ These options help improve the performance and memory usage of the LLaMA models.

### Number of Threads

- `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. In some systems, it is beneficial to use a higher number of threads during batch processing than during generation. If not specified, the number of threads used for batch processing will be the same as the number of threads used for generation.
- `-t N, --threads N`: Set the number of threads to use during computation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance.

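As a quick illustration of the option described above (model path and thread count are examples only):

```sh
# e.g. on a machine with 8 physical cores
./main -m models/llama-2-7b.Q4_K_M.gguf -t 8 -p "Building a website can be done in 10 simple steps:"
```
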
### Mlock

@@ -306,5 +305,6 @@ These options provide extra functionality and customization when running the LLa
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

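For example, a cuBLAS build running across two GPUs might combine these flags as follows (layer count and split values are illustrative only):

```sh
# offload 40 layers, split large tensors 60/40 between GPU 0 and GPU 1, keep small tensors on GPU 0
./main -m models/llama-2-13b.Q4_K_M.gguf -ngl 40 -ts 3,2 -mg 0 -p "Once upon a time"
```
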
@@ -3,6 +3,7 @@
#include "console.h"
#include "llama.h"
#include "build-info.h"
#include "grammar-parser.h"

#include <cassert>
#include <cinttypes>
@@ -108,7 +109,6 @@ int main(int argc, char ** argv) {
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
llama_sampling_params & sparams = params.sampling_params;

#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("main", "log"));
@@ -140,17 +140,12 @@ int main(int argc, char ** argv) {
return 0;
}

if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
if (params.rope_freq_base != 10000.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
}

if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
}

if (params.rope_freq_scale != 0.0) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
if (params.rope_freq_scale != 1.0) {
LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
}

LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@@ -179,7 +174,7 @@ int main(int argc, char ** argv) {
// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (sparams.cfg_scale > 1.f) {
if (params.cfg_scale > 1.f) {
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
ctx_guidance = llama_new_context_with_model(model, lparams);
}
@@ -189,19 +184,20 @@ int main(int argc, char ** argv) {
return 1;
}

const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);

if (n_ctx > n_ctx_train) {
const int n_ctx_train = llama_n_ctx_train(ctx);
if (params.n_ctx > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
__func__, n_ctx_train, params.n_ctx);
} else if (params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}

// print system information
{
LOG_TEE("\n");
LOG_TEE("%s\n", get_system_info(params).c_str());
LOG_TEE("system_info: n_threads = %d / %d | %s\n",
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
}

std::string path_session = params.path_prompt_cache;
@@ -215,7 +211,7 @@ int main(int argc, char ** argv) {
if (fp != NULL) {
std::fclose(fp);

session_tokens.resize(n_ctx);
session_tokens.resize(params.n_ctx);
size_t n_token_count_out = 0;
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
@@ -230,26 +226,26 @@ int main(int argc, char ** argv) {
}
}

const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
LOG("add_bos: %d\n", add_bos);

std::vector<llama_token> embd_inp;

if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
} else {
LOG("use session tokens\n");
embd_inp = session_tokens;
}

LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));

// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(ctx));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
}

// Tokenize negative prompt
@@ -257,13 +253,13 @@ int main(int argc, char ** argv) {
int guidance_offset = 0;
int original_prompt_len = 0;
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));

guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));

std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));

original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
@@ -271,6 +267,9 @@ int main(int argc, char ** argv) {
LOG("guidance_offset: %s", log_tostr(guidance_offset));
}

const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);

if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
@@ -296,9 +295,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
__func__, n_matching_session_tokens, embd_inp.size());
}

// remove any "future" tokens that we might have inherited from the previous session
llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
}

LOGLN(
@@ -319,11 +315,11 @@ int main(int argc, char ** argv) {
}

// prefix & suffix for instruct mode
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);

LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));

// in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) {
@@ -346,7 +342,7 @@ int main(int argc, char ** argv) {

if (ctx_guidance) {
LOG_TEE("\n");
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@@ -382,12 +378,6 @@ int main(int argc, char ** argv) {
if (!params.antiprompt.empty()) {
for (const auto & antiprompt : params.antiprompt) {
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
}

@@ -397,29 +387,46 @@ int main(int argc, char ** argv) {

if (!params.input_prefix.empty()) {
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}

if (!params.input_suffix.empty()) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
if (params.verbose_prompt) {
auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
}
}
}
}
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");

struct llama_grammar * grammar = NULL;
grammar_parser::parse_state parsed_grammar;

if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
return 1;
}
LOG_TEE("%s: grammar:\n", __func__);
grammar_parser::print_grammar(stderr, parsed_grammar);
LOG_TEE("\n");

{
auto it = params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) {
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
}
}

std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
}

// TODO: replace with ring-buffer
std::vector<llama_token> last_tokens(n_ctx);
std::fill(last_tokens.begin(), last_tokens.end(), 0);

if (params.interactive) {
const char *control_message;
if (params.multiline_input) {
@@ -459,7 +466,10 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;

struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
const int n_vocab = llama_n_vocab(ctx);

std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);

while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
@@ -506,7 +516,7 @@ int main(int argc, char ** argv) {

LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
|
||||
|
||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
|
||||
|
||||
LOG("clear session path\n");
|
||||
path_session.clear();
|
||||
@@ -536,6 +546,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// evaluate tokens in batches
|
||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
||||
|
||||
if (ctx_guidance) {
|
||||
int input_size = 0;
|
||||
llama_token * input_buf = NULL;
|
||||
@@ -557,7 +568,7 @@ int main(int argc, char ** argv) {
|
||||
input_buf = embd_guidance.data();
|
||||
input_size = embd_guidance.size();
|
||||
|
||||
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
|
||||
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
|
||||
} else {
|
||||
input_buf = embd.data();
|
||||
input_size = embd.size();
|
||||
@@ -565,7 +576,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
for (int i = 0; i < input_size; i += params.n_batch) {
|
||||
int n_eval = std::min(input_size - i, params.n_batch);
|
||||
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
|
||||
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0), params.n_threads)) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -580,9 +591,9 @@ int main(int argc, char ** argv) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
|
||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
|
||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0), params.n_threads)) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -610,11 +621,12 @@ int main(int argc, char ** argv) {
|
||||
LOG("saved session to %s\n", path_session.c_str());
|
||||
}
|
||||
|
||||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
|
||||
const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, id);
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
last_tokens.push_back(id);
|
||||
|
||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
|
||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
|
||||
|
||||
embd.push_back(id);
|
||||
|
||||
@@ -630,14 +642,8 @@ int main(int argc, char ** argv) {
|
||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
|
||||
// GG: I'm not sure it's a good idea to push the prompt tokens into the sampling context
|
||||
// Most likely will remove this in the future to avoid exposing "prev"
|
||||
// Same thing is done in "server". If we stop pushing the prompt tokens, then the repetition
|
||||
// penalty will be applied only based on the tokens generated by the model.
|
||||
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
|
||||
ctx_sampling->prev.push_back(embd_inp[n_consumed]);
|
||||
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
last_tokens.push_back(embd_inp[n_consumed]);
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
break;
|
||||
@@ -660,7 +666,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
// reset color to default if there is no pending user input
|
||||
// reset color to default if we there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
}
|
||||
@@ -670,7 +676,7 @@ int main(int argc, char ** argv) {
|
||||
// check for reverse prompt
|
||||
if (!params.antiprompt.empty()) {
|
||||
std::string last_output;
|
||||
for (auto id : ctx_sampling->prev) {
|
||||
for (auto id : last_tokens) {
|
||||
last_output += llama_token_to_piece(ctx, id);
|
||||
}
|
||||
|
||||
@@ -687,8 +693,10 @@ int main(int argc, char ** argv) {
|
||||
if (last_output.find(antiprompt, search_start_pos) != std::string::npos) {
|
||||
if (params.interactive) {
|
||||
is_interacting = true;
|
||||
console::set_display(console::user_input);
|
||||
}
|
||||
is_antiprompt = true;
|
||||
fflush(stdout);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -699,19 +707,21 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// deal with end of text token in interactive mode
|
||||
if (ctx_sampling->prev.back() == llama_token_eos(ctx)) {
|
||||
if (last_tokens.back() == llama_token_eos(ctx)) {
|
||||
LOG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
if (!params.antiprompt.empty()) {
|
||||
// tokenize and inject first reverse prompt
|
||||
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
|
||||
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
|
||||
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
|
||||
is_antiprompt = true;
|
||||
}
|
||||
|
||||
is_interacting = true;
|
||||
printf("\n");
|
||||
console::set_display(console::user_input);
|
||||
fflush(stdout);
|
||||
} else if (params.instruct) {
|
||||
is_interacting = true;
|
||||
}
|
||||
@@ -732,12 +742,10 @@ int main(int argc, char ** argv) {
std::string buffer;
if (!params.input_prefix.empty()) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
printf("%s", params.input_prefix.c_str());
buffer += params.input_prefix;
printf("%s", buffer.c_str());
}

// color user input only
console::set_display(console::user_input);

std::string line;
bool another_line = true;
do {
@@ -754,6 +762,7 @@ int main(int argc, char ** argv) {
// append input suffix if any
if (!params.input_suffix.empty()) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
buffer += params.input_suffix;
printf("%s", params.input_suffix.c_str());
}

@@ -768,14 +777,10 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
}

const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));

embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());

// instruct mode: insert response suffix
if (params.instruct) {
@@ -800,7 +805,15 @@ int main(int argc, char ** argv) {

if (n_past > 0) {
if (is_interacting) {
llama_sampling_reset(ctx_sampling);
// reset grammar state if we're restarting generation
if (grammar != NULL) {
llama_grammar_free(grammar);

std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(),
parsed_grammar.symbol_ids.at("root"));
}
}
is_interacting = false;
}
@@ -832,11 +845,13 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);

llama_sampling_free(ctx_sampling);
if (grammar != NULL) {
llama_grammar_free(grammar);
}
llama_backend_free();

#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
LOG_TEE("Log end\n")
#endif // LOG_DISABLE_LOGS

return 0;

@@ -10,7 +10,6 @@
#include <cstdio>
#include <string>
#include <vector>
#include <ctime>

// trim whitespace from the beginning and end of a string
static std::string trim(const std::string & str) {
@@ -51,12 +50,6 @@ static std::vector<std::string> k_prompts = {
};

struct client {
~client() {
if (ctx_sampling) {
llama_sampling_free(ctx_sampling);
}
}

int32_t id = 0;

llama_seq_id seq_id = -1;
@@ -74,29 +67,9 @@ struct client {
std::string prompt;
std::string response;

struct llama_sampling_context * ctx_sampling = nullptr;
std::vector<llama_token> tokens_prev;
};

static void print_date_time() {
std::time_t current_time = std::time(nullptr);
std::tm* local_time = std::localtime(&current_time);
char buffer[80];
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);

printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
}

// Define a split string function to ...
static std::vector<std::string> split_string(const std::string& input, char delimiter) {
std::vector<std::string> tokens;
std::istringstream stream(input);
std::string token;
while (std::getline(stream, token, delimiter)) {
tokens.push_back(token);
}
return tokens;
}

int main(int argc, char ** argv) {
srand(1234);

@@ -131,35 +104,23 @@ int main(int argc, char ** argv) {
params.logits_all = true;
std::tie(model, ctx) = llama_init_from_gpt_params(params);

// load the prompts from an external file if there are any
if (params.prompt.empty()) {
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
} else {
// Output each line of the input params.prompts vector and copy to k_prompts
int index = 0;
printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());

std::vector<std::string> prompts = split_string(params.prompt, '\n');
for (const auto& prompt : prompts) {
k_prompts.resize(index + 1);
k_prompts[index] = prompt;
index++;
printf("%3d prompt: %s\n", index, prompt.c_str());
}
}

fprintf(stderr, "\n\n");
fflush(stderr);

const int n_ctx = llama_n_ctx(ctx);
const int n_ctx = llama_n_ctx(ctx);
const int n_vocab = llama_n_vocab(ctx);

std::vector<client> clients(n_clients);
for (size_t i = 0; i < clients.size(); ++i) {
auto & client = clients[i];
client.id = i;
client.ctx_sampling = llama_sampling_init(params);
client.tokens_prev.resize(std::max(256, params.n_predict));
std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);
}

std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);

std::vector<llama_token> tokens_system;
tokens_system = ::llama_tokenize(ctx, k_system, true);
const int32_t n_tokens_system = tokens_system.size();
@@ -168,7 +129,7 @@ int main(int argc, char ** argv) {

// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
llama_batch batch = llama_batch_init(n_ctx, 0, 1);
llama_batch batch = llama_batch_init(params.n_ctx, 0);

int32_t n_total_prompt = 0;
int32_t n_total_gen = 0;
@@ -183,11 +144,16 @@ int main(int argc, char ** argv) {
{
LOG_TEE("%s: Evaluating the system prompt ...\n", __func__);

for (int32_t i = 0; i < n_tokens_system; ++i) {
llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
batch.n_tokens = n_tokens_system;

for (int32_t i = 0; i < batch.n_tokens; ++i) {
batch.token[i] = tokens_system[i];
batch.pos[i] = i;
batch.seq_id[i] = 0;
batch.logits[i] = false;
}

if (llama_decode(ctx, batch) != 0) {
if (llama_decode(ctx, batch, params.n_threads) != 0) {
LOG_TEE("%s: llama_decode() failed\n", __func__);
return 1;
}
@@ -203,7 +169,7 @@ int main(int argc, char ** argv) {
LOG_TEE("Processing requests ...\n\n");

while (true) {
llama_batch_clear(batch);
batch.n_tokens = 0;

// decode any currently ongoing sequences
for (auto & client : clients) {
@@ -211,11 +177,15 @@ int main(int argc, char ** argv) {
continue;
}

client.i_batch = batch.n_tokens;

llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true);
batch.token [batch.n_tokens] = client.sampled;
batch.pos [batch.n_tokens] = n_tokens_system + client.n_prompt + client.n_decoded;
batch.seq_id[batch.n_tokens] = client.id;
batch.logits[batch.n_tokens] = true;

client.n_decoded += 1;
client.i_batch = batch.n_tokens;

batch.n_tokens += 1;
}

if (batch.n_tokens == 0) {
@@ -240,14 +210,18 @@ int main(int argc, char ** argv) {
client.prompt = client.input + "\nAssistant:";
client.response = "";

llama_sampling_reset(client.ctx_sampling);
std::fill(client.tokens_prev.begin(), client.tokens_prev.end(), 0);

// do not prepend BOS because we have a system prompt!
std::vector<llama_token> tokens_prompt;
tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);

for (size_t i = 0; i < tokens_prompt.size(); ++i) {
llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false);
batch.token [batch.n_tokens] = tokens_prompt[i];
batch.pos [batch.n_tokens] = i + n_tokens_system;
batch.seq_id[batch.n_tokens] = client.id;
batch.logits[batch.n_tokens] = false;
batch.n_tokens += 1;
}

// extract the logits only for the last token
@@ -259,7 +233,7 @@ int main(int argc, char ** argv) {
client.n_decoded = 0;
client.i_batch = batch.n_tokens - 1;

LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
LOG_TEE("\033[1mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);

g_seq_id += 1;

@@ -290,16 +264,15 @@ int main(int argc, char ** argv) {

llama_batch batch_view = {
n_tokens,
batch.token + i,
batch.token + i,
nullptr,
batch.pos + i,
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
batch.pos + i,
batch.seq_id + i,
batch.logits + i,
0, 0, 0, // unused
};

const int ret = llama_decode(ctx, batch_view);
const int ret = llama_decode(ctx, batch_view, params.n_threads);
if (ret != 0) {
if (n_batch == 1 || ret < 0) {
// if you get here, it means the KV cache is full - try increasing it via the context size
@@ -328,9 +301,7 @@ int main(int argc, char ** argv) {
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);

const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);

llama_sampling_accept(client.ctx_sampling, ctx, id);
const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i);

if (client.n_decoded == 1) {
// start measuring generation time after the first token to make sure all concurrent clients
@@ -338,8 +309,11 @@ int main(int argc, char ** argv) {
client.t_start_gen = ggml_time_us();
}

const std::string token_str = llama_token_to_piece(ctx, id);
// remember which tokens were sampled - used for repetition penalties during sampling
client.tokens_prev.erase(client.tokens_prev.begin());
client.tokens_prev.push_back(id);

const std::string token_str = llama_token_to_piece(ctx, id);
client.response += token_str;
client.sampled = id;

@@ -358,12 +332,12 @@ int main(int argc, char ** argv) {
}

// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1);
llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, n_ctx);

const auto t_main_end = ggml_time_us();

LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\nResponse: %s\n\n",
client.id, client.seq_id, client.n_prompt, client.n_decoded,
(t_main_end - client.t_start_prompt) / 1e6,
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
n_cache_miss,
@@ -383,21 +357,13 @@ int main(int argc, char ** argv) {

const auto t_main_end = ggml_time_us();

print_date_time();

LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
if (params.prompt_file.empty()) {
params.prompt_file = "used built-in defaults";
}
LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());

LOG_TEE("\n\n");
LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
LOG_TEE("Cache misses: %6d\n", n_cache_miss);

LOG_TEE("\n");
LOG_TEE("\n\n");

llama_print_timings(ctx);


@@ -150,18 +150,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval

const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
const bool add_bos = is_spm;

fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

const int n_ctx = llama_n_ctx(ctx);

if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx);
if (int(tokens.size()) < 2*params.n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx,
params.n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
@@ -177,20 +175,20 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
return {tokens, -1, logit_history, prob_history};
}

const int calc_chunk = n_ctx;
const int calc_chunk = params.n_ctx;

fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);

if (int(tokens.size()) <= calc_chunk) {
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
tokens.size(), n_ctx, params.ppl_stride);
tokens.size(), params.n_ctx, params.ppl_stride);
return {tokens, -1, logit_history, prob_history};
}

const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;

const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_vocab = llama_n_vocab(ctx);
const int n_batch = params.n_batch;

int count = 0;
@@ -217,7 +215,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
const int batch_size = std::min(end - batch_start, n_batch);

//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0), params.n_threads)) {
//fprintf(stderr, "%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history};
}
@@ -252,7 +250,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
}

//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
for (int j = params.n_ctx - params.ppl_stride - 1; j < params.n_ctx - 1; ++j) {

// Calculate probability of next token, given the previous ones.
const std::vector<float> tok_logits(
@@ -289,9 +287,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval

const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
const bool add_bos = is_spm;
const int n_ctx = llama_n_ctx(ctx);

auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@@ -301,9 +298,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx);
if (int(tokens.size()) < 2*params.n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx,
params.n_ctx);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
@@ -314,10 +311,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
std::vector<float> prob_history;
prob_history.resize(tokens.size());

const int n_chunk_max = tokens.size() / n_ctx;
const int n_chunk_max = tokens.size() / params.n_ctx;

const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_vocab = llama_n_vocab(ctx);
const int n_batch = params.n_batch;

int count = 0;
@@ -329,10 +326,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const int start = i * params.n_ctx;
const int end = start + params.n_ctx;

const int num_batches = (n_ctx + n_batch - 1) / n_batch;
const int num_batches = (params.n_ctx + n_batch - 1) / n_batch;

std::vector<float> logits;

@@ -353,7 +350,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
tokens[batch_start] = llama_token_bos(ctx);
}

if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0), params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return {tokens, -1, logit_history, prob_history};
}
@@ -361,7 +358,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;

const auto * batch_logits = llama_get_logits(ctx);
const auto batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}

@@ -390,10 +387,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt.
const int first = n_ctx/2;
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
const int first = params.n_ctx/2;
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
|
||||
count += n_ctx - first - 1;
|
||||
count += params.n_ctx - first - 1;
|
||||
|
||||
// perplexity is e^(average negative log-likelihood)
|
||||
if (params.ppl_output_type == 0) {
|
||||
@@ -402,7 +399,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
double av = nll/count;
|
||||
double av2 = nll2/count - av*av;
|
||||
if (av2 > 0) av2 = sqrt(av2/(count-1));
|
||||
printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
|
||||
printf("%8d %.4lf %4lf %4lf\n", i*params.n_ctx, std::exp(nll / count), av, av2);
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
@@ -423,7 +420,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
}
|
||||
|
||||
static std::vector<float> hellaswag_evaluate_tokens(
|
||||
llama_context * ctx, std::vector<int> & tokens, int n_past, int n_batch, int n_vocab
|
||||
llama_context * ctx, std::vector<int> & tokens, int n_past, int n_batch, int n_vocab, int n_thread
|
||||
) {
|
||||
std::vector<float> result;
|
||||
result.reserve(tokens.size() * n_vocab);
|
||||
@@ -431,7 +428,7 @@ static std::vector<float> hellaswag_evaluate_tokens(
|
||||
for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
|
||||
size_t n_tokens = tokens.size() - i_chunk * n_batch;
|
||||
n_tokens = std::min(n_tokens, size_t(n_batch));
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0), n_thread)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return {};
|
||||
}
|
||||
@@ -478,7 +475,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
size_t hs_task_count = prompt_lines.size()/6;
|
||||
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
|
||||
|
||||
const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
|
||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
|
||||
|
||||
// This is needed as usual for LLaMA models
|
||||
@@ -533,8 +530,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
printf("\ntask\tacc_norm\n");
|
||||
|
||||
double acc = 0.0f;
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
std::vector<std::vector<int>> ending_tokens(4);
|
||||
|
||||
@@ -562,7 +558,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
auto query_size = query_embd.size();
|
||||
|
||||
// Stop if query wont fit the ctx window
|
||||
if (query_size > (size_t)n_ctx) {
|
||||
if (query_size > (size_t)params.n_ctx) {
|
||||
fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
|
||||
return;
|
||||
}
|
||||
@@ -575,7 +571,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
// clear the KV cache
|
||||
llama_kv_cache_tokens_rm(ctx, -1, -1);
|
||||
|
||||
auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
|
||||
auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
|
||||
if (logits.empty()) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return;
|
||||
@@ -612,7 +608,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
query_size = query_embd.size();
|
||||
|
||||
// Stop if query wont fit the ctx window
|
||||
if (context_size + query_size > (size_t)n_ctx) {
|
||||
if (context_size + query_size > (size_t)params.n_ctx) {
|
||||
fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
|
||||
return;
|
||||
}
|
||||
@@ -624,7 +620,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
//}
|
||||
|
||||
// Evaluate the query
|
||||
logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab);
|
||||
logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab, params.n_threads);
|
||||
if (logits.empty()) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return;
|
||||
@@ -720,7 +716,7 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
const int n_ctx_train = llama_n_ctx_train(ctx);
|
||||
if (params.n_ctx > n_ctx_train) {
|
||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, params.n_ctx);
|
||||
@@ -729,7 +725,8 @@ int main(int argc, char ** argv) {
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
|
||||
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
||||
}
|
||||
|
||||
struct results_perplexity results;
|
||||
|
||||
@@ -309,22 +309,21 @@ int main(int argc, char ** argv) {
llama_context * ctx;

{
auto mparams = llama_model_default_params();
mparams.use_mlock = false;
auto lparams = llama_context_default_params();

model = llama_load_model_from_file(params.model.c_str(), mparams);
lparams.n_ctx = 256;
lparams.seed = 1;
lparams.f16_kv = false;
lparams.use_mlock = false;

model = llama_load_model_from_file(params.model.c_str(), lparams);

if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}

auto cparams = llama_context_default_params();
cparams.n_ctx = 256;
cparams.seed = 1;
cparams.f16_kv = false;

ctx = llama_new_context_with_model(model, cparams);
ctx = llama_new_context_with_model(model, lparams);

if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());

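One side of the hunk above drives both model and context creation off a single `lparams` struct, while the other splits the settings into model parameters and context parameters. Collected in one place, the split form looks roughly like the sketch below; it reuses only the calls and values visible in the hunk, assumes the usual llama.h declarations are already in scope, and abbreviates error handling:

    // split initialization: model parameters first, then context parameters
    auto mparams = llama_model_default_params();
    mparams.use_mlock = false;

    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
    if (model == NULL) {
        return 1; // failed to load the model
    }

    auto cparams = llama_context_default_params();
    cparams.n_ctx  = 256;
    cparams.seed   = 1;
    cparams.f16_kv = false;

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1; // failed to create the context
    }
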
@@ -72,7 +72,6 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
// usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");

@@ -8,7 +9,9 @@

int main(int argc, char ** argv) {
gpt_params params;

params.seed = 42;
params.n_threads = 4;
params.repeat_last_n = 64;
params.prompt = "The quick brown fox";

if (!gpt_params_parse(argc, argv, params)) {
@@ -21,50 +23,63 @@ int main(int argc, char ** argv) {
params.n_predict = 16;
}

auto n_past = 0;
auto lparams = llama_context_default_params();

std::string result0;
std::string result1;
lparams.n_ctx = params.n_ctx;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;

auto n_past = 0;
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);

// init
llama_model * model;
llama_context * ctx;

std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
auto * model = llama_load_model_from_file(params.model.c_str(), lparams);
if (model == nullptr) {
return 1;
}
auto * ctx = llama_new_context_with_model(model, lparams);
if (ctx == nullptr) {
llama_free_model(model);
return 1;
}
auto tokens = llama_tokenize(ctx, params.prompt, true);
auto n_prompt_tokens = tokens.size();
if (n_prompt_tokens < 1) {
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
llama_free(ctx);
llama_free_model(model);
return 1;
}

// tokenize prompt
auto tokens = llama_tokenize(ctx, params.prompt, true);

// evaluate prompt
llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));
n_past += tokens.size();
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt_tokens, n_past, 0), params.n_threads);

// save state (rng, logits, embedding and kv_cache) to file
last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
n_past += n_prompt_tokens;

const size_t state_size = llama_get_state_size(ctx);
uint8_t * state_mem = new uint8_t[state_size];

// Save state (rng, logits, embedding and kv_cache) to file
{
std::vector<uint8_t> state_mem(llama_get_state_size(ctx));

{
FILE *fp_write = fopen("dump_state.bin", "wb");
llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file
fwrite(state_mem.data(), 1, state_mem.size(), fp_write);
fclose(fp_write);
}
FILE *fp_write = fopen("dump_state.bin", "wb");
llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
fwrite(state_mem, 1, state_size, fp_write);
fclose(fp_write);
}

// save state (last tokens)
const auto last_n_tokens_data_saved = std::vector<llama_token>(last_n_tokens_data);
const auto n_past_saved = n_past;

// first run
printf("\nfirst run: %s", params.prompt.c_str());
printf("\n%s", params.prompt.c_str());

for (auto i = 0; i < params.n_predict; i++) {
auto * logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(model);

auto n_vocab = llama_n_vocab(ctx);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@@ -73,11 +88,10 @@ int main(int argc, char ** argv) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx, &candidates_p);
auto next_token_str = llama_token_to_piece(ctx, next_token);
last_n_tokens_data.push_back(next_token);

printf("%s", next_token_str.c_str());
result0 += next_token_str;

if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0))) {
if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0), params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx);
llama_free_model(model);
@@ -92,36 +106,40 @@ int main(int argc, char ** argv) {
llama_free(ctx);

// make new context
auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
auto * ctx2 = llama_new_context_with_model(model, lparams);

printf("\nsecond run: %s", params.prompt.c_str());

// load state (rng, logits, embedding and kv_cache) from file
// Load state (rng, logits, embedding and kv_cache) from file
{
std::vector<uint8_t> state_mem(llama_get_state_size(ctx2));
FILE *fp_read = fopen("dump_state.bin", "rb");
if (state_size != llama_get_state_size(ctx2)) {
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
llama_free(ctx2);
llama_free_model(model);
return 1;
}

FILE * fp_read = fopen("dump_state.bin", "rb");

const size_t ret = fread(state_mem.data(), 1, state_mem.size(), fp_read);
if (ret != state_mem.size()) {
const size_t ret = fread(state_mem, 1, state_size, fp_read);
if (ret != state_size) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
llama_free(ctx2);
llama_free_model(model);
return 1;
}

llama_set_state_data(ctx2, state_mem.data());

llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
fclose(fp_read);
}

delete[] state_mem;

// restore state (last tokens)
last_n_tokens_data = last_n_tokens_data_saved;
n_past = n_past_saved;

// second run
for (auto i = 0; i < params.n_predict; i++) {
auto * logits = llama_get_logits(ctx2);
auto n_vocab = llama_n_vocab(model);
auto n_vocab = llama_n_vocab(ctx2);
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@@ -130,11 +148,10 @@ int main(int argc, char ** argv) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
auto next_token = llama_sample_token(ctx2, &candidates_p);
auto next_token_str = llama_token_to_piece(ctx2, next_token);
last_n_tokens_data.push_back(next_token);

printf("%s", next_token_str.c_str());
result1 += next_token_str;

if (llama_decode(ctx2, llama_batch_get_one(&next_token, 1, n_past, 0))) {
if (llama_decode(ctx, llama_batch_get_one(&next_token, 1, n_past, 0), params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx2);
llama_free_model(model);
@@ -143,17 +160,10 @@ int main(int argc, char ** argv) {
n_past += 1;
}

printf("\n");
printf("\n\n");

llama_free(ctx2);
llama_free_model(model);

if (result0 != result1) {
fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__);
return 1;
}

fprintf(stderr, "\n%s : success\n", __func__);

return 0;
}
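
Stripped of the interleaved before/after lines, the state round-trip that this example exercises reduces to roughly the pattern below. It is only a sketch built from the calls that appear in the diff above; it assumes `ctx` and `ctx2` are contexts created from the same model, reuses the example's `dump_state.bin` path, and omits the size-validation and error paths that the full example keeps:

    // serialize rng, logits, embedding and kv_cache into a flat buffer
    std::vector<uint8_t> state_mem(llama_get_state_size(ctx));
    llama_copy_state_data(ctx, state_mem.data());

    FILE * fp = fopen("dump_state.bin", "wb");
    fwrite(state_mem.data(), 1, state_mem.size(), fp);
    fclose(fp);

    // later: read the buffer back and restore it into a fresh context
    fp = fopen("dump_state.bin", "rb");
    const size_t nread = fread(state_mem.data(), 1, state_mem.size(), fp);
    (void) nread; // the full example checks this against the expected state size
    fclose(fp);
    llama_set_state_data(ctx2, state_mem.data());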
@@ -4,14 +4,14 @@ This example demonstrates a simple HTTP API server and a simple web front end to

Command line options:

- `--threads N`, `-t N`: Set the number of threads to use during generation.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
- `--threads N`, `-t N`: Set the number of threads to use during computation.
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`.
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
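
(As an aside, a typical launch combining a few of the flags above might be `./server -m models/7B/ggml-model.gguf -c 2048 -t 8 -ngl 32`; the model path and the number of offloaded layers are only illustrative.)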

@@ -106,25 +106,25 @@ node index.js

## API Endpoints

- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
- **POST** `/completion`: Given a prompt, it returns the predicted completion.

*Options:*

`prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. If the prompt is a string or an array with the first element given as a string, a `bos` token is inserted in the front like `main` does.

`temperature`: Adjust the randomness of the generated text (default: 0.8).

`top_k`: Limit the next token selection to the K most probable tokens (default: 40).

`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).

`n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
`n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: 128, -1 = infinity).

`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the prompt.
`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.

`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.

`prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.

`stop`: Specify a JSON array of stopping strings.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).

@@ -156,38 +156,6 @@ node index.js

`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).

`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
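
As a concrete illustration (assuming the server is running locally on its default port), a minimal completion request using a couple of the options documented above could look like:

    curl --request POST \
        --url http://localhost:8080/completion \
        --header "Content-Type: application/json" \
        --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 128, "temperature": 0.8}'

The prompt text and parameter values here are only an example.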

*Result JSON:*

Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.

`content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.

`stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)

`generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`

`model`: The path to the model loaded with `-m`

`prompt`: The provided `prompt`

`stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token

`stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered

`stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided

`stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word)

`timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`

`tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)

`tokens_evaluated`: Number of tokens evaluated in total from the prompt

`truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
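
Pieced together from the field list above, a non-streaming response has roughly the following shape; the values are invented for illustration and the larger `generation_settings` and `timings` objects are abridged:

    {
      "content": " Paris.",
      "stop": true,
      "model": "models/7B/ggml-model.gguf",
      "prompt": "The capital of France is",
      "stopped_eos": true,
      "stopped_limit": false,
      "stopped_word": false,
      "stopping_word": "",
      "tokens_cached": 6,
      "tokens_evaluated": 6,
      "truncated": false,
      "generation_settings": { "...": "..." },
      "timings": { "...": "..." }
    }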

- **POST** `/tokenize`: Tokenize a given text.

*Options:*
@@ -208,16 +176,6 @@ node index.js

`content`: Set the text to process.

**POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.

*Options:*

`input_prefix`: Set the prefix of the code to infill.

`input_suffix`: Set the suffix of the code to infill.

It also accepts all the options of `/completion` except `stream` and `prompt`.
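
For instance (again assuming the default local address), an infill request built from the two fields above might be:

    curl --request POST \
        --url http://localhost:8080/infill \
        --header "Content-Type: application/json" \
        --data '{"input_prefix": "def helper(a, b):\n    ", "input_suffix": "\n    return result\n", "n_predict": 32}'

The prefix and suffix snippets are placeholders; any code fragments can be supplied.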

## More examples

### Interactive mode
@@ -27,10 +27,10 @@ def is_present(json, key):
buf = json[key]
except KeyError:
return False
if json[key] == None:
return False
return True


#convert chat to prompt
def convert_chat(messages):
prompt = "" + args.chat_prompt.replace("\\n", "\n")

File diff suppressed because it is too large
@@ -136,11 +136,6 @@
|
||||
display: block;
|
||||
}
|
||||
|
||||
fieldset label.slim {
|
||||
margin: 0 0.5em;
|
||||
display: inline;
|
||||
}
|
||||
|
||||
header, footer {
|
||||
text-align: center;
|
||||
}
|
||||
@@ -150,14 +145,6 @@
|
||||
color: #888;
|
||||
}
|
||||
|
||||
.mode-chat textarea[name=prompt] {
|
||||
height: 4.5em;
|
||||
}
|
||||
|
||||
.mode-completion textarea[name=prompt] {
|
||||
height: 10em;
|
||||
}
|
||||
|
||||
|
||||
@keyframes loading-bg-wipe {
|
||||
0% {
|
||||
@@ -200,7 +187,7 @@
|
||||
template: "{{prompt}}\n\n{{history}}\n{{char}}:",
|
||||
historyTemplate: "{{name}}: {{message}}",
|
||||
transcript: [],
|
||||
type: "chat", // "chat" | "completion"
|
||||
type: "chat",
|
||||
char: "Llama",
|
||||
user: "User",
|
||||
})
|
||||
@@ -378,44 +365,13 @@
|
||||
return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
|
||||
}
|
||||
|
||||
async function runLlama(prompt, llamaParams, char) {
|
||||
const currentMessages = [];
|
||||
const history = session.value.transcript;
|
||||
if (controller.value) {
|
||||
throw new Error("already running");
|
||||
}
|
||||
controller.value = new AbortController();
|
||||
for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
|
||||
const data = chunk.data;
|
||||
|
||||
if (data.stop) {
|
||||
while (
|
||||
currentMessages.length > 0 &&
|
||||
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
|
||||
) {
|
||||
currentMessages.pop();
|
||||
}
|
||||
transcriptUpdate([...history, [char, currentMessages]])
|
||||
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
|
||||
} else {
|
||||
currentMessages.push(data);
|
||||
transcriptUpdate([...history, [char, currentMessages]])
|
||||
}
|
||||
|
||||
if (data.timings) {
|
||||
llamaStats.value = data.timings;
|
||||
}
|
||||
}
|
||||
|
||||
controller.value = null;
|
||||
}
|
||||
|
||||
// send message to server
|
||||
const chat = async (msg) => {
|
||||
if (controller.value) {
|
||||
console.log('already running...');
|
||||
return;
|
||||
}
|
||||
controller.value = new AbortController();
|
||||
|
||||
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
|
||||
|
||||
@@ -435,41 +391,55 @@
|
||||
).join("\n"),
|
||||
});
|
||||
|
||||
await runLlama(prompt, {
|
||||
const currentMessages = [];
|
||||
const history = session.value.transcript
|
||||
|
||||
const llamaParams = {
|
||||
...params.value,
|
||||
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
|
||||
}, "{{char}}");
|
||||
}
|
||||
|
||||
const runCompletion = async () => {
|
||||
if (controller.value) {
|
||||
console.log('already running...');
|
||||
return;
|
||||
}
|
||||
const {prompt} = session.value;
|
||||
transcriptUpdate([...session.value.transcript, ["", prompt]]);
|
||||
await runLlama(prompt, {
|
||||
...params.value,
|
||||
stop: [],
|
||||
}, "");
|
||||
}
|
||||
|
||||
const stop = (e) => {
|
||||
e.preventDefault();
|
||||
if (controller.value) {
|
||||
controller.value.abort();
|
||||
controller.value = null;
|
||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
|
||||
const data = chunk.data;
|
||||
|
||||
if (data.stop) {
|
||||
while (
|
||||
currentMessages.length > 0 &&
|
||||
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
|
||||
) {
|
||||
currentMessages.pop();
|
||||
}
|
||||
transcriptUpdate([...history, ["{{char}}", currentMessages]])
|
||||
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
|
||||
} else {
|
||||
currentMessages.push(data);
|
||||
transcriptUpdate([...history, ["{{char}}", currentMessages]])
|
||||
}
|
||||
|
||||
if (data.timings) {
|
||||
llamaStats.value = data.timings;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const reset = (e) => {
|
||||
stop(e);
|
||||
transcriptUpdate([]);
|
||||
controller.value = null;
|
||||
}
|
||||
|
||||
function MessageInput() {
|
||||
const message = useSignal("")
|
||||
|
||||
const stop = (e) => {
|
||||
e.preventDefault();
|
||||
if (controller.value) {
|
||||
controller.value.abort();
|
||||
controller.value = null;
|
||||
}
|
||||
}
|
||||
|
||||
const reset = (e) => {
|
||||
stop(e);
|
||||
transcriptUpdate([]);
|
||||
}
|
||||
|
||||
const submit = (e) => {
|
||||
stop(e);
|
||||
chat(message.value);
|
||||
@@ -504,19 +474,6 @@
|
||||
`
|
||||
}
|
||||
|
||||
function CompletionControls() {
|
||||
const submit = (e) => {
|
||||
stop(e);
|
||||
runCompletion();
|
||||
}
|
||||
return html`
|
||||
<div>
|
||||
<button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
|
||||
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
|
||||
<button onclick=${reset}>Reset</button>
|
||||
</div>`;
|
||||
}
|
||||
|
||||
const ChatLog = (props) => {
|
||||
const messages = session.value.transcript;
|
||||
const container = useRef(null)
|
||||
@@ -540,11 +497,7 @@
|
||||
data;
|
||||
message = html`<${Markdownish} text=${template(text)} />`
|
||||
}
|
||||
if(user) {
|
||||
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
|
||||
} else {
|
||||
return html`<p key=${index}>${message}</p>`
|
||||
}
|
||||
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
|
||||
};
|
||||
|
||||
return html`
|
||||
@@ -621,31 +574,18 @@
|
||||
userTemplateAutosave()
|
||||
}, [session.value, params.value])
|
||||
|
||||
const GrammarControl = () => (
|
||||
html`
|
||||
<div>
|
||||
<label for="template">Grammar</label>
|
||||
<textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
|
||||
<input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
||||
<button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
|
||||
</div>
|
||||
`
|
||||
);
|
||||
return html`
|
||||
<form>
|
||||
<fieldset>
|
||||
<${UserTemplateResetButton}/>
|
||||
</fieldset>
|
||||
|
||||
const PromptControlFieldSet = () => (
|
||||
html`
|
||||
<fieldset>
|
||||
<div>
|
||||
<label htmlFor="prompt">Prompt</label>
|
||||
<textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
|
||||
</div>
|
||||
</fieldset>
|
||||
`
|
||||
);
|
||||
|
||||
const ChatConfigForm = () => (
|
||||
html`
|
||||
${PromptControlFieldSet()}
|
||||
<fieldset>
|
||||
<div>
|
||||
<label for="prompt">Prompt</label>
|
||||
<textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="two">
|
||||
<div>
|
||||
@@ -669,30 +609,15 @@
|
||||
<label for="template">Chat history template</label>
|
||||
<textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
|
||||
</div>
|
||||
${GrammarControl()}
|
||||
</fieldset>
|
||||
`
|
||||
);
|
||||
|
||||
const CompletionConfigForm = () => (
|
||||
html`
|
||||
${PromptControlFieldSet()}
|
||||
<fieldset>${GrammarControl()}</fieldset>
|
||||
`
|
||||
);
|
||||
|
||||
return html`
|
||||
<form>
|
||||
<fieldset class="two">
|
||||
<${UserTemplateResetButton}/>
|
||||
<div>
|
||||
<label class="slim"><input type="radio" name="type" value="chat" checked=${session.value.type === "chat"} oninput=${updateSession} /> Chat</label>
|
||||
<label class="slim"><input type="radio" name="type" value="completion" checked=${session.value.type === "completion"} oninput=${updateSession} /> Completion</label>
|
||||
<label for="template">Grammar</label>
|
||||
<textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
|
||||
<input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
||||
<button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
|
||||
|
||||
<fieldset class="two">
|
||||
${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
|
||||
${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
|
||||
@@ -926,7 +851,7 @@
|
||||
function App(props) {
|
||||
|
||||
return html`
|
||||
<div class="mode-${session.value.type}">
|
||||
<div>
|
||||
<header>
|
||||
<h1>llama.cpp</h1>
|
||||
</header>
|
||||
@@ -936,7 +861,7 @@
|
||||
</main>
|
||||
|
||||
<section id="write">
|
||||
<${session.value.type === 'chat' ? MessageInput : CompletionControls} />
|
||||
<${MessageInput} />
|
||||
</section>
|
||||
|
||||
<footer>
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
#include "grammar-parser.h"
|
||||
|
||||
#ifndef NDEBUG
|
||||
// crash the server in debug mode, otherwise send an http 500 error
|
||||
@@ -194,12 +195,14 @@ struct llama_server_context
|
||||
|
||||
json prompt;
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> last_n_tokens;
|
||||
|
||||
llama_model *model = nullptr;
|
||||
llama_context *ctx = nullptr;
|
||||
gpt_params params;
|
||||
llama_sampling_context *ctx_sampling;
|
||||
int n_ctx;
|
||||
|
||||
grammar_parser::parse_state parsed_grammar;
|
||||
llama_grammar *grammar = nullptr;
|
||||
|
||||
bool truncated = false;
|
||||
bool stopped_eos = false;
|
||||
@@ -236,7 +239,7 @@ struct llama_server_context
|
||||
num_prompt_tokens = 0;
|
||||
num_tokens_predicted = 0;
|
||||
generated_text = "";
|
||||
generated_text.reserve(n_ctx);
|
||||
generated_text.reserve(params.n_ctx);
|
||||
generated_token_probs.clear();
|
||||
truncated = false;
|
||||
stopped_eos = false;
|
||||
@@ -247,10 +250,10 @@ struct llama_server_context
|
||||
n_remain = 0;
|
||||
n_past = 0;
|
||||
|
||||
if (ctx_sampling != nullptr) {
|
||||
llama_sampling_free(ctx_sampling);
|
||||
if (grammar != nullptr) {
|
||||
llama_grammar_free(grammar);
|
||||
grammar = nullptr;
|
||||
}
|
||||
ctx_sampling = llama_sampling_init(params);
|
||||
}
|
||||
|
||||
bool loadModel(const gpt_params ¶ms_)
|
||||
@@ -262,7 +265,9 @@ struct llama_server_context
|
||||
LOG_ERROR("unable to load model", {{"model", params_.model}});
|
||||
return false;
|
||||
}
|
||||
n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
last_n_tokens.resize(params.n_ctx);
|
||||
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -313,30 +318,32 @@ struct llama_server_context
|
||||
|
||||
bool loadGrammar()
|
||||
{
|
||||
ctx_sampling = llama_sampling_init(params);
|
||||
if (!params.grammar.empty()) {
|
||||
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
||||
// will be empty (default) if there are parse errors
|
||||
if (parsed_grammar.rules.empty()) {
|
||||
LOG_ERROR("grammar parse error", {{"grammar", params.grammar}});
|
||||
return false;
|
||||
}
|
||||
grammar_parser::print_grammar(stderr, parsed_grammar);
|
||||
|
||||
{
|
||||
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
||||
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
||||
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(
|
||||
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void loadInfill()
|
||||
void loadPrompt()
|
||||
{
|
||||
bool suff_rm_leading_spc = true;
|
||||
if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
|
||||
params.input_suffix.erase(0, 1);
|
||||
suff_rm_leading_spc = false;
|
||||
}
|
||||
|
||||
auto prefix_tokens = tokenize(params.input_prefix, false);
|
||||
auto suffix_tokens = tokenize(params.input_suffix, false);
|
||||
const int space_token = 29871;
|
||||
if (suff_rm_leading_spc && suffix_tokens[0] == space_token) {
|
||||
suffix_tokens.erase(suffix_tokens.begin());
|
||||
}
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
|
||||
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
|
||||
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
||||
prefix_tokens.push_back(llama_token_middle(ctx));
|
||||
auto prompt_tokens = prefix_tokens;
|
||||
auto prompt_tokens = tokenize(prompt, true); // always add BOS
|
||||
|
||||
num_prompt_tokens = prompt_tokens.size();
|
||||
|
||||
@@ -349,13 +356,11 @@ struct llama_server_context
|
||||
// if input prompt is too big, truncate like normal
|
||||
if (num_prompt_tokens >= (size_t)params.n_ctx)
|
||||
{
|
||||
printf("Input prompt is too big, truncating. Can only take %d tokens but got %zu\n", params.n_ctx, num_prompt_tokens);
|
||||
// todo we probably want to cut from both sides
|
||||
const int n_left = (params.n_ctx - params.n_keep) / 2;
|
||||
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
|
||||
const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
|
||||
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
|
||||
std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), ctx_sampling->prev.begin());
|
||||
std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
|
||||
|
||||
LOG_VERBOSE("input truncated", {
|
||||
{"n_ctx", params.n_ctx},
|
||||
@@ -370,72 +375,15 @@ struct llama_server_context
|
||||
else
|
||||
{
|
||||
const size_t ps = num_prompt_tokens;
|
||||
std::fill(ctx_sampling->prev.begin(), ctx_sampling->prev.end() - ps, 0);
|
||||
std::copy(prompt_tokens.begin(), prompt_tokens.end(), ctx_sampling->prev.end() - ps);
|
||||
std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
|
||||
std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
|
||||
}
|
||||
|
||||
// compare the evaluated prompt with the new prompt
|
||||
n_past = common_part(embd, prompt_tokens);
|
||||
embd = prompt_tokens;
|
||||
|
||||
if (n_past == num_prompt_tokens)
|
||||
{
|
||||
// we have to evaluate at least 1 token to generate logits.
|
||||
printf("we have to evaluate at least 1 token to generate logits\n");
|
||||
n_past--;
|
||||
}
|
||||
|
||||
// since #3228 we now have to manually manage the KV cache
|
||||
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||
|
||||
LOG_VERBOSE("prompt ingested", {
|
||||
{"n_past", n_past},
|
||||
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
||||
{"to_eval", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())},
|
||||
});
|
||||
|
||||
has_next_token = true;
|
||||
}
|
||||
void loadPrompt()
|
||||
{
|
||||
auto prompt_tokens = tokenize(prompt, true); // always add BOS
|
||||
|
||||
num_prompt_tokens = prompt_tokens.size();
|
||||
|
||||
if (params.n_keep < 0)
|
||||
{
|
||||
params.n_keep = (int)num_prompt_tokens;
|
||||
}
|
||||
params.n_keep = std::min(n_ctx - 4, params.n_keep);
|
||||
|
||||
// if input prompt is too big, truncate like normal
|
||||
if (num_prompt_tokens >= (size_t)n_ctx)
|
||||
{
|
||||
const int n_left = (n_ctx - params.n_keep) / 2;
|
||||
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
|
||||
const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
|
||||
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
|
||||
std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), ctx_sampling->prev.begin());
|
||||
|
||||
LOG_VERBOSE("input truncated", {
|
||||
{"n_ctx", n_ctx},
|
||||
{"n_keep", params.n_keep},
|
||||
{"n_left", n_left},
|
||||
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
|
||||
});
|
||||
|
||||
truncated = true;
|
||||
prompt_tokens = new_tokens;
|
||||
}
|
||||
else
|
||||
{
|
||||
const size_t ps = num_prompt_tokens;
|
||||
std::fill(ctx_sampling->prev.begin(), ctx_sampling->prev.end() - ps, 0);
|
||||
std::copy(prompt_tokens.begin(), prompt_tokens.end(), ctx_sampling->prev.end() - ps);
|
||||
}
|
||||
|
||||
// compare the evaluated prompt with the new prompt
|
||||
n_past = common_part(embd, prompt_tokens);
|
||||
llama_kv_cache_seq_rm(ctx, 0, n_past, params.n_ctx);
|
||||
|
||||
embd = prompt_tokens;
|
||||
if (n_past == num_prompt_tokens)
|
||||
@@ -444,9 +392,6 @@ struct llama_server_context
|
||||
n_past--;
|
||||
}
|
||||
|
||||
// since #3228 we now have to manually manage the KV cache
|
||||
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
||||
|
||||
LOG_VERBOSE("prompt ingested", {
|
||||
{"n_past", n_past},
|
||||
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
||||
@@ -468,7 +413,7 @@ struct llama_server_context
|
||||
completion_token_output result;
|
||||
result.tok = -1;
|
||||
|
||||
if (embd.size() >= (size_t)n_ctx)
|
||||
if (embd.size() >= (size_t)params.n_ctx)
|
||||
{
|
||||
// Shift context
|
||||
|
||||
@@ -488,27 +433,26 @@ struct llama_server_context
|
||||
|
||||
truncated = true;
|
||||
LOG_VERBOSE("input truncated", {
|
||||
{"n_ctx", n_ctx},
|
||||
{"n_ctx", params.n_ctx},
|
||||
{"n_keep", params.n_keep},
|
||||
{"n_left", n_left},
|
||||
});
|
||||
}
|
||||
|
||||
bool tg = true;
|
||||
while (n_past < embd.size())
|
||||
{
|
||||
int n_eval = (int)embd.size() - n_past;
|
||||
tg = n_eval == 1;
|
||||
if (n_eval > params.n_batch)
|
||||
{
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0), params.n_threads))
|
||||
{
|
||||
LOG_ERROR("failed to eval", {
|
||||
{"n_eval", n_eval},
|
||||
{"n_past", n_past},
|
||||
{"n_threads", params.n_threads},
|
||||
{"embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())},
|
||||
});
|
||||
has_next_token = false;
|
||||
@@ -524,29 +468,108 @@ struct llama_server_context
|
||||
return result;
|
||||
}
|
||||
|
||||
// out of user input, sample next token
|
||||
const float temp = params.temp;
|
||||
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n;
|
||||
const float repeat_penalty = params.repeat_penalty;
|
||||
const float alpha_presence = params.presence_penalty;
|
||||
const float alpha_frequency = params.frequency_penalty;
|
||||
const int mirostat = params.mirostat;
|
||||
const float mirostat_tau = params.mirostat_tau;
|
||||
const float mirostat_eta = params.mirostat_eta;
|
||||
const bool penalize_nl = params.penalize_nl;
|
||||
const int32_t n_probs = params.n_probs;
|
||||
|
||||
{
|
||||
// out of user input, sample next token
|
||||
result.tok = llama_sampling_sample(ctx_sampling, ctx, NULL);
|
||||
auto *logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
llama_token_data_array cur_p = { ctx_sampling->cur.data(), ctx_sampling->cur.size(), false };
|
||||
|
||||
const int32_t n_probs = params.sampling_params.n_probs;
|
||||
if (params.sampling_params.temp <= 0 && n_probs > 0)
|
||||
// Apply params.logit_bias map
|
||||
for (const auto &it : params.logit_bias)
|
||||
{
|
||||
// For llama_sample_token_greedy we need to sort candidates
|
||||
llama_sample_softmax(ctx, &cur_p);
|
||||
logits[it.first] += it.second;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++)
|
||||
{
|
||||
result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx, result.tok);
|
||||
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
|
||||
|
||||
if (tg) {
|
||||
num_tokens_predicted++;
|
||||
// Apply penalties
|
||||
float nl_logit = logits[llama_token_nl(ctx)];
|
||||
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
|
||||
llama_sample_repetition_penalty(ctx, &candidates_p,
|
||||
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, repeat_penalty);
|
||||
llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
|
||||
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, alpha_frequency, alpha_presence);
|
||||
if (!penalize_nl)
|
||||
{
|
||||
logits[llama_token_nl(ctx)] = nl_logit;
|
||||
}
|
||||
|
||||
if (grammar != nullptr) {
|
||||
llama_sample_grammar(ctx, &candidates_p, grammar);
|
||||
}
|
||||
|
||||
if (temp <= 0)
|
||||
{
|
||||
// Greedy sampling
|
||||
result.tok = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
if (n_probs > 0)
|
||||
{
|
||||
llama_sample_softmax(ctx, &candidates_p);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (mirostat == 1)
|
||||
{
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
}
|
||||
else if (mirostat == 2)
|
||||
{
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Temperature sampling
|
||||
size_t min_keep = std::max(1, n_probs);
|
||||
llama_sample_top_k(ctx, &candidates_p, top_k, min_keep);
|
||||
llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
|
||||
llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
result.tok = llama_sample_token(ctx, &candidates_p);
|
||||
}
|
||||
}
|
||||
|
||||
if (grammar != nullptr) {
|
||||
llama_grammar_accept_token(ctx, grammar, result.tok);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
|
||||
{
|
||||
result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
|
||||
}
|
||||
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(result.tok);
|
||||
num_tokens_predicted++;
|
||||
}
|
||||
|
||||
// add it to the context
|
||||
@@ -606,7 +629,7 @@ struct llama_server_context
|
||||
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
|
||||
generated_text += token_text;
|
||||
|
||||
if (params.sampling_params.n_probs > 0)
|
||||
if (params.n_probs > 0)
|
||||
{
|
||||
generated_token_probs.push_back(token_with_probs);
|
||||
}
|
||||
@@ -667,7 +690,7 @@ struct llama_server_context
|
||||
|
||||
std::vector<float> getEmbedding()
|
||||
{
|
||||
static const int n_embd = llama_n_embd(model);
|
||||
static const int n_embd = llama_n_embd(ctx);
|
||||
if (!params.embedding)
|
||||
{
|
||||
LOG_WARNING("embedding disabled", {
|
||||
@@ -687,16 +710,15 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf("usage: %s [options]\n", argv0);
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
if (llama_mlock_supported())
|
||||
{
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
@@ -712,6 +734,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
||||
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||
printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
||||
printf(" -nommq, --no-mul-mat-q\n");
|
||||
printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
|
||||
printf(" Not recommended since this is both slower and uses more VRAM.\n");
|
||||
@@ -841,15 +864,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
params.n_threads = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--threads-batch" || arg == "-tb")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads_batch = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "-b" || arg == "--batch-size")
|
||||
{
|
||||
if (++i >= argc)
|
||||
@@ -904,6 +918,14 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
#else
|
||||
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
|
||||
#endif // GGML_USE_CUBLAS
|
||||
}
|
||||
else if (arg == "--low-vram" || arg == "-lv")
|
||||
{
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
params.low_vram = true;
|
||||
#else
|
||||
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
|
||||
#endif // GGML_USE_CUBLAS
|
||||
}
|
||||
else if (arg == "--no-mul-mat-q" || arg == "-nommq")
|
||||
@@ -934,23 +956,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--lora-scaled")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
const char * lora_adapter = argv[i];
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
|
||||
params.lora_adapter = argv[i];
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--lora-base")
|
||||
@@ -1004,35 +1010,34 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
|
||||
static json format_generation_settings(llama_server_context &llama)
|
||||
{
|
||||
const auto & sparams = llama.params.sampling_params;
|
||||
const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx));
|
||||
const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
|
||||
const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
|
||||
const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
|
||||
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
||||
|
||||
return json{
|
||||
{"n_ctx", llama.n_ctx},
|
||||
{"n_ctx", llama.params.n_ctx},
|
||||
{"model", llama.params.model_alias},
|
||||
{"seed", llama.params.seed},
|
||||
{"temp", sparams.temp},
|
||||
{"top_k", sparams.top_k},
|
||||
{"top_p", sparams.top_p},
|
||||
{"tfs_z", sparams.tfs_z},
|
||||
{"typical_p", sparams.typical_p},
|
||||
{"repeat_last_n", sparams.repeat_last_n},
|
||||
{"repeat_penalty", sparams.repeat_penalty},
|
||||
{"presence_penalty", sparams.presence_penalty},
|
||||
{"frequency_penalty", sparams.frequency_penalty},
|
||||
{"mirostat", sparams.mirostat},
|
||||
{"mirostat_tau", sparams.mirostat_tau},
|
||||
{"mirostat_eta", sparams.mirostat_eta},
|
||||
{"penalize_nl", sparams.penalize_nl},
|
||||
{"temp", llama.params.temp},
|
||||
{"top_k", llama.params.top_k},
|
||||
{"top_p", llama.params.top_p},
|
||||
{"tfs_z", llama.params.tfs_z},
|
||||
{"typical_p", llama.params.typical_p},
|
||||
{"repeat_last_n", llama.params.repeat_last_n},
|
||||
{"repeat_penalty", llama.params.repeat_penalty},
|
||||
{"presence_penalty", llama.params.presence_penalty},
|
||||
{"frequency_penalty", llama.params.frequency_penalty},
|
||||
{"mirostat", llama.params.mirostat},
|
||||
{"mirostat_tau", llama.params.mirostat_tau},
|
||||
{"mirostat_eta", llama.params.mirostat_eta},
|
||||
{"penalize_nl", llama.params.penalize_nl},
|
||||
{"stop", llama.params.antiprompt},
|
||||
{"n_predict", llama.params.n_predict},
|
||||
{"n_keep", llama.params.n_keep},
|
||||
{"ignore_eos", ignore_eos},
|
||||
{"stream", llama.stream},
|
||||
{"logit_bias", sparams.logit_bias},
|
||||
{"n_probs", sparams.n_probs},
|
||||
{"logit_bias", llama.params.logit_bias},
|
||||
{"n_probs", llama.params.n_probs},
|
||||
{"grammar", llama.params.grammar},
|
||||
};
|
||||
}
|
||||
@@ -1048,6 +1053,8 @@ static json format_timings(llama_server_context &llama)
|
||||
{
|
||||
const auto timings = llama_get_timings(llama.ctx);
|
||||
|
||||
assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
|
||||
|
||||
return json{
|
||||
{"prompt_n", timings.n_p_eval},
|
||||
{"prompt_ms", timings.t_p_eval_ms},
|
||||
@@ -1081,7 +1088,7 @@ static json format_final_response(llama_server_context &llama, const std::string
|
||||
{"timings", format_timings(llama)},
|
||||
};
|
||||
|
||||
if (llama.params.sampling_params.n_probs > 0)
|
||||
if (llama.params.n_probs > 0)
|
||||
{
|
||||
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
||||
}
|
||||
@@ -1097,7 +1104,7 @@ static json format_partial_response(
|
||||
{"stop", false},
|
||||
};
|
||||
|
||||
if (llama.params.sampling_params.n_probs > 0)
|
||||
if (llama.params.n_probs > 0)
|
||||
{
|
||||
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
||||
}
|
||||
@@ -1129,28 +1136,26 @@ static T json_value(const json &body, const std::string &key, const T &default_v
|
||||
static void parse_options_completion(const json &body, llama_server_context &llama)
|
||||
{
|
||||
gpt_params default_params;
|
||||
const auto & default_sparams = default_params.sampling_params;
|
||||
auto & sparams = llama.params.sampling_params;
|
||||
|
||||
llama.stream = json_value(body, "stream", false);
|
||||
llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
|
||||
sparams.top_k = json_value(body, "top_k", default_sparams.top_k);
|
||||
sparams.top_p = json_value(body, "top_p", default_sparams.top_p);
|
||||
sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z);
|
||||
sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p);
|
||||
sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n);
|
||||
sparams.temp = json_value(body, "temperature", default_sparams.temp);
|
||||
sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty);
|
||||
sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty);
|
||||
sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty);
|
||||
sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat);
|
||||
sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
|
||||
sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
|
||||
sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl);
|
||||
llama.params.top_k = json_value(body, "top_k", default_params.top_k);
|
||||
llama.params.top_p = json_value(body, "top_p", default_params.top_p);
|
||||
llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
|
||||
llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
|
||||
llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
|
||||
llama.params.temp = json_value(body, "temperature", default_params.temp);
|
||||
llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
|
||||
llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
|
||||
llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
|
||||
llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
|
||||
llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
|
||||
llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
|
||||
llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
|
||||
llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
|
||||
llama.params.seed = json_value(body, "seed", default_params.seed);
|
||||
llama.params.grammar = json_value(body, "grammar", default_params.grammar);
|
||||
sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs);
|
||||
llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
|
||||
|
||||
if (body.count("prompt") != 0)
|
||||
{
|
||||
@@ -1161,16 +1166,16 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
||||
llama.prompt = "";
|
||||
}
|
||||
|
||||
sparams.logit_bias.clear();
|
||||
llama.params.logit_bias.clear();
|
||||
if (json_value(body, "ignore_eos", false))
|
||||
{
|
||||
sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
|
||||
llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
|
||||
}
|
||||
|
||||
const auto &logit_bias = body.find("logit_bias");
|
||||
if (logit_bias != body.end() && logit_bias->is_array())
|
||||
{
|
||||
const int n_vocab = llama_n_vocab(llama.model);
|
||||
const int n_vocab = llama_n_vocab(llama.ctx);
|
||||
for (const auto &el : *logit_bias)
|
||||
{
|
||||
if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
|
||||
@@ -1180,11 +1185,11 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
||||
{
|
||||
if (el[1].is_number())
|
||||
{
|
||||
sparams.logit_bias[tok] = el[1].get<float>();
|
||||
llama.params.logit_bias[tok] = el[1].get<float>();
|
||||
}
|
||||
else if (el[1].is_boolean() && !el[1].get<bool>())
|
||||
{
|
||||
sparams.logit_bias[tok] = -INFINITY;
|
||||
llama.params.logit_bias[tok] = -INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1204,32 +1209,9 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
||||
}
|
||||
}
|
||||
|
||||
llama.ctx_sampling = llama_sampling_init(llama.params);
|
||||
|
||||
LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
|
||||
}
|
||||
|
||||
static void parse_options_infill(const json &body, llama_server_context &llama)
|
||||
{
|
||||
if (body.count("input_prefix") != 0)
|
||||
{
|
||||
llama.params.input_prefix = body["input_prefix"];
|
||||
}
|
||||
else
|
||||
{
|
||||
llama.params.input_prefix = "";
|
||||
}
|
||||
if (body.count("input_suffix") != 0)
|
||||
{
|
||||
llama.params.input_suffix = body["input_suffix"];
|
||||
}
|
||||
else
|
||||
{
|
||||
llama.params.input_suffix = "";
|
||||
}
|
||||
parse_options_completion(body, llama);
|
||||
}
|
||||
|
||||
static void log_server_request(const Request &req, const Response &res)
|
||||
{
|
||||
LOG_INFO("request", {
|
||||
@@ -1326,7 +1308,6 @@ int main(int argc, char **argv)
|
||||
{"commit", BUILD_COMMIT}});
|
||||
LOG_INFO("system info", {
|
||||
{"n_threads", params.n_threads},
|
||||
{"n_threads_batch", params.n_threads_batch},
|
||||
{"total_threads", std::thread::hardware_concurrency()},
|
||||
{"system_info", llama_print_system_info()},
|
||||
});
|
||||
@@ -1390,7 +1371,7 @@ int main(int argc, char **argv)
|
||||
if (llama.params.n_beams) {
|
||||
// Fill llama.generated_token_probs vector with final beam.
|
||||
llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
|
||||
llama.n_past, llama.n_remain);
|
||||
llama.n_past, llama.n_remain, llama.params.n_threads);
|
||||
// Translate llama.generated_token_probs to llama.generated_text.
|
||||
append_to_generated_text_from_generated_token_probs(llama);
|
||||
} else {
|
||||
@@ -1414,7 +1395,7 @@ int main(int argc, char **argv)
|
||||
}
|
||||
|
||||
auto probs = llama.generated_token_probs;
|
||||
if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
|
||||
if (llama.params.n_probs > 0 && llama.stopped_word) {
|
||||
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
|
||||
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
|
||||
}
|
||||
@@ -1466,7 +1447,7 @@ int main(int argc, char **argv)
|
||||
|
||||
std::vector<completion_token_output> probs_output = {};
|
||||
|
||||
if (llama.params.sampling_params.n_probs > 0) {
|
||||
if (llama.params.n_probs > 0) {
|
||||
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
||||
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
||||
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
||||
@@ -1530,127 +1511,6 @@ int main(int argc, char **argv)
|
||||
res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
|
||||
} });
|
||||
|
||||
svr.Post("/infill", [&llama](const Request &req, Response &res)
|
||||
{
|
||||
auto lock = llama.lock();
|
||||
|
||||
llama.rewind();
|
||||
|
||||
llama_reset_timings(llama.ctx);
|
||||
|
||||
parse_options_infill(json::parse(req.body), llama);
|
||||
|
||||
if (!llama.loadGrammar())
|
||||
{
|
||||
res.status = 400;
|
||||
return;
|
||||
}
|
||||
llama.loadInfill();
|
||||
llama.beginCompletion();
|
||||
const auto chunked_content_provider = [&](size_t, DataSink & sink) {
|
||||
size_t sent_count = 0;
|
||||
size_t sent_token_probs_index = 0;
|
||||
|
||||
while (llama.has_next_token) {
|
||||
const completion_token_output token_with_probs = llama.doCompletion();
|
||||
if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
|
||||
continue;
|
||||
}
|
||||
const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);
|
||||
|
||||
size_t pos = std::min(sent_count, llama.generated_text.size());
|
||||
|
||||
const std::string str_test = llama.generated_text.substr(pos);
|
||||
bool is_stop_full = false;
|
||||
size_t stop_pos =
|
||||
llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
|
||||
if (stop_pos != std::string::npos) {
|
||||
is_stop_full = true;
|
||||
llama.generated_text.erase(
|
||||
llama.generated_text.begin() + pos + stop_pos,
|
||||
llama.generated_text.end());
|
||||
pos = std::min(sent_count, llama.generated_text.size());
|
||||
} else {
|
||||
is_stop_full = false;
|
||||
stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
|
||||
STOP_PARTIAL);
|
||||
}
|
||||
|
||||
if (
|
||||
stop_pos == std::string::npos ||
|
||||
// Send rest of the text if we are at the end of the generation
|
||||
(!llama.has_next_token && !is_stop_full && stop_pos > 0)
|
||||
) {
|
||||
const std::string to_send = llama.generated_text.substr(pos, std::string::npos);
|
||||
|
||||
sent_count += to_send.size();
|
||||
|
||||
std::vector<completion_token_output> probs_output = {};
|
||||
|
||||
if (llama.params.sampling_params.n_probs > 0) {
|
||||
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
||||
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
||||
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
||||
if (probs_pos < probs_stop_pos) {
|
||||
probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
|
||||
}
|
||||
sent_token_probs_index = probs_stop_pos;
|
||||
}
|
||||
|
||||
const json data = format_partial_response(llama, to_send, probs_output);
|
||||
|
||||
const std::string str =
|
||||
"data: " +
|
||||
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||
"\n\n";
|
||||
|
||||
LOG_VERBOSE("data stream", {
|
||||
{ "to_send", str }
|
||||
});
|
||||
|
||||
if (!sink.write(str.data(), str.size())) {
|
||||
LOG_VERBOSE("stream closed", {});
|
||||
llama_print_timings(llama.ctx);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!llama.has_next_token) {
|
||||
// Generation is done, send extra information.
|
||||
const json data = format_final_response(
|
||||
llama,
|
||||
"",
|
||||
std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index)
|
||||
);
|
||||
|
||||
const std::string str =
|
||||
"data: " +
|
||||
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||
"\n\n";
|
||||
|
||||
LOG_VERBOSE("data stream", {
|
||||
{ "to_send", str }
|
||||
});
|
||||
|
||||
if (!sink.write(str.data(), str.size())) {
|
||||
LOG_VERBOSE("stream closed", {});
|
||||
llama_print_timings(llama.ctx);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
llama_print_timings(llama.ctx);
|
||||
sink.done();
|
||||
return true;
|
||||
};
|
||||
const auto on_complete = [&](bool) {
|
||||
llama.mutex.unlock();
|
||||
};
|
||||
lock.release();
|
||||
res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
|
||||
});
|
||||
|
||||
svr.Get("/model.json", [&llama](const Request &, Response &res)
|
||||
{
|
||||
const json data = format_generation_settings(llama);
|
||||
@@ -1762,7 +1622,9 @@ int main(int argc, char **argv)
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_sampling_free(llama.ctx_sampling);
|
||||
if (llama.grammar != nullptr) {
|
||||
llama_grammar_free(llama.grammar);
|
||||
}
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -33,28 +33,18 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
// initialize the model
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
ctx_params.seed = 1234;
|
||||
ctx_params.n_ctx = 2048;
|
||||
|
||||
// model_params.n_gpu_layers = 99; // offload all layers to the GPU
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// initialize the context
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
|
||||
ctx_params.seed = 1234;
|
||||
ctx_params.n_ctx = 2048;
|
||||
ctx_params.n_threads = params.n_threads;
|
||||
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
if (ctx == NULL) {
|
||||
@@ -92,7 +82,7 @@ int main(int argc, char ** argv) {
|
||||
// create a llama_batch with size 512
|
||||
// we use this object to submit token data for decoding
|
||||
|
||||
llama_batch batch = llama_batch_init(512, 0, 1);
|
||||
llama_batch batch = llama_batch_init(512, 0);
|
||||
|
||||
// evaluate the initial prompt
|
||||
batch.n_tokens = tokens_list.size();
|
||||
@@ -107,7 +97,7 @@ int main(int argc, char ** argv) {
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
if (llama_decode(ctx, batch, params.n_threads) != 0) {
|
||||
LOG_TEE("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -122,7 +112,7 @@ int main(int argc, char ** argv) {
|
||||
while (n_cur <= n_len) {
|
||||
// sample the next token
|
||||
{
|
||||
auto n_vocab = llama_n_vocab(model);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
auto * logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
@@ -164,7 +154,7 @@ int main(int argc, char ** argv) {
|
||||
n_cur += 1;
|
||||
|
||||
// evaluate the current batch with the transformer model
|
||||
if (llama_decode(ctx, batch)) {
|
||||
if (llama_decode(ctx, batch, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -2,25 +2,13 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "grammar-parser.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
struct seq_draft {
|
||||
bool active = false;
|
||||
bool drafting = false;
|
||||
bool skip = false;
|
||||
|
||||
int i_batch_dft = 0;
|
||||
std::vector<int> i_batch_tgt;
|
||||
|
||||
std::vector<llama_token> tokens;
|
||||
|
||||
struct llama_sampling_context * ctx_sampling;
|
||||
};
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
@@ -33,13 +21,6 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// max number of parallel drafting sequences (i.e. tree branches)
|
||||
const int n_seq_dft = params.n_parallel;
|
||||
|
||||
// TODO: make this configurable
|
||||
const float p_accept = 0.4f;
|
||||
const float p_split = 0.3f;
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("speculative", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
@@ -89,14 +70,16 @@ int main(int argc, char ** argv) {
|
||||
const auto t_enc_start = ggml_time_us();
|
||||
|
||||
// eval the prompt with both models
|
||||
llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
|
||||
llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));
|
||||
llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0));
|
||||
llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1, 0, 0), params.n_threads);
|
||||
llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0), params.n_threads);
|
||||
llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input, 0, 0), params.n_threads);
|
||||
|
||||
const auto t_enc_end = ggml_time_us();
|
||||
|
||||
// the 2 models should have the same vocab
|
||||
//GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
|
||||
const int n_ctx = llama_n_ctx(ctx_tgt);
|
||||
const int n_vocab = llama_n_vocab(ctx_tgt);
|
||||
//GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));
|
||||
|
||||
// how many tokens to draft each time
|
||||
int n_draft = params.n_draft;
|
||||
@@ -108,58 +91,58 @@ int main(int argc, char ** argv) {
|
||||
int n_past_tgt = inp.size();
|
||||
int n_past_dft = inp.size();
|
||||
|
||||
std::vector<llama_token> drafted;
|
||||
|
||||
std::vector<llama_token> last_tokens(n_ctx);
|
||||
std::fill(last_tokens.begin(), last_tokens.end(), 0);
|
||||
|
||||
for (auto & id : inp) {
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
last_tokens.push_back(id);
|
||||
}
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
// used to determine end of generation
|
||||
bool has_eos = false;
|
||||
|
||||
// target model sampling context
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
|
||||
// grammar stuff
|
||||
struct llama_grammar * grammar_dft = NULL;
|
||||
struct llama_grammar * grammar_tgt = NULL;
|
||||
|
||||
// draft sequence data
|
||||
std::vector<seq_draft> drafts(n_seq_dft);
|
||||
grammar_parser::parse_state parsed_grammar;
|
||||
|
||||
params.grammar.clear(); // the draft samplers will copy the target sampler's grammar
|
||||
params.sampling_params.temp = 1.0f; // the draft samplers use default temperature
|
||||
// if requested - load the grammar, error checking is omitted for brevity
|
||||
if (!params.grammar.empty()) {
|
||||
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
||||
// will be empty (default) if there are parse errors
|
||||
if (parsed_grammar.rules.empty()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
drafts[s].ctx_sampling = llama_sampling_init(params);
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
|
||||
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
|
||||
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft);
|
||||
|
||||
const auto t_dec_start = ggml_time_us();
|
||||
|
||||
// sample from the last token of the prompt
|
||||
drafts[0].i_batch_tgt.resize(1);
|
||||
drafts[0].i_batch_tgt[0] = 0;
|
||||
|
||||
while (true) {
|
||||
// print current draft sequences
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
if (!drafts[s].active) {
|
||||
continue;
|
||||
}
|
||||
LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
|
||||
|
||||
const auto & tokens = drafts[s].tokens;
|
||||
|
||||
LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str());
|
||||
}
|
||||
|
||||
int i_dft = 0;
|
||||
int s_keep = 0;
|
||||
int i_dft = 0;
|
||||
|
||||
while (true) {
|
||||
LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
|
||||
// sample from the target model
|
||||
llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
|
||||
llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
|
||||
|
||||
llama_sampling_accept(ctx_sampling, ctx_tgt, id);
|
||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
last_tokens.push_back(id);
|
||||
|
||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens));
|
||||
|
||||
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
|
||||
|
||||
printf("%s", token_str.c_str());
|
||||
fflush(stdout);
|
||||
|
||||
@@ -169,68 +152,54 @@ int main(int argc, char ** argv) {
|
||||
|
||||
++n_predict;
|
||||
|
||||
// check if the target token matches any of the drafts
|
||||
{
|
||||
bool matches = false;
|
||||
// check if the draft matches the target
|
||||
if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
|
||||
LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
|
||||
++n_accept;
|
||||
++n_past_tgt;
|
||||
++n_past_dft;
|
||||
++i_dft;
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
if (!drafts[s].active) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (i_dft < (int) drafts[s].tokens.size() && id == drafts[s].tokens[i_dft]) {
|
||||
LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, id, token_str.c_str());
|
||||
|
||||
s_keep = s;
|
||||
matches = true;
|
||||
} else {
|
||||
drafts[s].active = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (matches) {
|
||||
++n_accept;
|
||||
++n_past_tgt;
|
||||
++n_past_dft;
|
||||
++i_dft;
|
||||
|
||||
continue;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
|
||||
// the drafted token was rejected or we are out of drafted tokens
|
||||
|
||||
// TODO: simplify
|
||||
{
|
||||
LOG("keeping sequence %d\n", s_keep);
|
||||
|
||||
llama_kv_cache_seq_keep(ctx_dft, s_keep);
|
||||
llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
|
||||
llama_kv_cache_seq_keep(ctx_dft, 0);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx_tgt, s_keep, n_past_tgt, -1);
|
||||
llama_kv_cache_seq_keep(ctx_tgt, s_keep);
|
||||
llama_kv_cache_seq_cp (ctx_tgt, s_keep, 0, -1, -1);
|
||||
llama_kv_cache_seq_keep(ctx_tgt, 0);
|
||||
if (i_dft < (int) drafted.size()) {
|
||||
LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n",
|
||||
i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str());
|
||||
} else {
|
||||
LOG("out of drafted tokens\n");
|
||||
}
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
drafts[s].active = false;
|
||||
drafts[s].tokens.clear();
|
||||
drafts[s].i_batch_tgt.clear();
|
||||
}
|
||||
// note: will be erased after the speculation phase
|
||||
drafts[0].tokens.push_back(id);
|
||||
drafts[0].i_batch_tgt.push_back(0);
|
||||
|
||||
llama_batch_clear(batch_dft);
|
||||
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
||||
llama_decode (ctx_dft, batch_dft);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, n_ctx);
|
||||
llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0), params.n_threads);
|
||||
++n_past_dft;
|
||||
|
||||
// heuristic for n_draft
|
||||
{
|
||||
const int n_draft_cur = (int) drafted.size();
|
||||
const bool all_accepted = i_dft == n_draft_cur;
|
||||
|
||||
LOG("n_draft = %d\n", n_draft);
|
||||
LOG("n_draft_cur = %d\n", n_draft_cur);
|
||||
LOG("i_dft = %d\n", i_dft);
|
||||
LOG("all_accepted = %d\n", all_accepted);
|
||||
|
||||
if (all_accepted && n_draft == n_draft_cur) {
|
||||
LOG(" - max drafted tokens accepted - n_draft += 8\n");
|
||||
n_draft = std::min(30, n_draft + 8);
|
||||
} else if (all_accepted) {
|
||||
LOG(" - partially drafted tokens accepted - no change\n");
|
||||
} else {
|
||||
LOG(" - drafted token rejected - n_draft -= 1\n");
|
||||
n_draft = std::max(2, n_draft - 1);
|
||||
}
|
||||
}
|
||||
|
||||
drafted.clear();
|
||||
drafted.push_back(id);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -238,158 +207,72 @@ int main(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
|
||||
llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling);
|
||||
if (grammar_tgt) {
|
||||
if (grammar_dft) {
|
||||
llama_grammar_free(grammar_dft);
|
||||
}
|
||||
grammar_dft = llama_grammar_copy(grammar_tgt);
|
||||
|
||||
int n_seq_cur = 1;
|
||||
LOG("copied target grammar to draft grammar\n");
|
||||
}
|
||||
|
||||
// sample n_draft tokens from the draft model using greedy decoding
|
||||
int n_past_cur = n_past_dft;
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
drafts[s].active = false;
|
||||
drafts[s].drafting = false;
|
||||
}
|
||||
drafts[0].active = true;
|
||||
drafts[0].drafting = true;
|
||||
drafts[0].i_batch_dft = 0;
|
||||
|
||||
llama_batch_clear(batch_tgt);
|
||||
llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
|
||||
|
||||
// sample n_draft tokens from the draft model using tree-based sampling
|
||||
for (int i = 0; i < n_draft; ++i) {
|
||||
batch_dft.n_tokens = 0;
|
||||
float * logits = llama_get_logits(ctx_dft);
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
drafts[s].skip = false;
|
||||
candidates.clear();
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
if (!drafts[s].drafting || drafts[s].skip) {
|
||||
continue;
|
||||
}
|
||||
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
llama_sampling_sample(drafts[s].ctx_sampling, ctx_dft, NULL, drafts[s].i_batch_dft);
|
||||
|
||||
const auto & cur_p = drafts[s].ctx_sampling->cur;
|
||||
|
||||
for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p.size()); ++k) {
|
||||
LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
|
||||
k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
|
||||
}
|
||||
|
||||
if (cur_p[0].p < p_accept) {
|
||||
LOG("stopping drafting for seq %3d, probability too low: %.3f < 2*%.3f\n", s, cur_p[0].p, cur_p[1].p);
|
||||
drafts[s].drafting = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<int> sa(1, s);
|
||||
|
||||
// attempt to split the branch if the probability is high enough
|
||||
for (int f = 1; f < 8; ++f) {
|
||||
if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) {
|
||||
LOG("splitting seq %3d into %3d\n", s, n_seq_cur);
|
||||
|
||||
llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
|
||||
llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
|
||||
|
||||
// all previous tokens from this branch are now also part of the new branch
|
||||
for (int t = 0; t < batch_tgt.n_tokens; ++t) {
|
||||
for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) {
|
||||
if (batch_tgt.seq_id[t][p] == s) {
|
||||
batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur;
|
||||
batch_tgt.n_seq_id[t]++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// copy the draft state
|
||||
drafts[n_seq_cur].active = true;
|
||||
drafts[n_seq_cur].drafting = true;
|
||||
drafts[n_seq_cur].skip = true;
|
||||
|
||||
drafts[n_seq_cur].tokens = drafts[s].tokens;
|
||||
drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
|
||||
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;
|
||||
|
||||
llama_sampling_cp(drafts[s].ctx_sampling, drafts[n_seq_cur].ctx_sampling);
|
||||
|
||||
sa.push_back(n_seq_cur);
|
||||
|
||||
n_seq_cur++;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// add drafted token for each sequence
|
||||
for (int is = 0; is < (int) sa.size(); ++is) {
|
||||
const llama_token id = cur_p[is].id;
|
||||
|
||||
const int s = sa[is];
|
||||
|
||||
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id);
|
||||
|
||||
drafts[s].tokens.push_back(id);
|
||||
|
||||
// add unique drafted tokens to the target batch
|
||||
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
|
||||
|
||||
llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
|
||||
|
||||
// no need to evaluate the last drafted token, since we won't use the result
|
||||
if (batch_tgt.n_tokens > n_draft) {
|
||||
drafts[s].drafting = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
// add the token to the batch for batched decoding with the draft model
|
||||
drafts[s].i_batch_dft = batch_dft.n_tokens;
|
||||
|
||||
llama_batch_add(batch_dft, id, n_past_cur, { s }, true);
|
||||
}
|
||||
if (grammar_dft != NULL) {
|
||||
llama_sample_grammar(ctx_dft, &cur_p, grammar_dft);
|
||||
}
|
||||
|
||||
// no sequence is drafting anymore
|
||||
if (batch_dft.n_tokens == 0) {
|
||||
// computes softmax and sorts the candidates
|
||||
llama_sample_softmax(ctx_dft, &cur_p);
|
||||
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str());
|
||||
}
|
||||
|
||||
// TODO: better logic?
|
||||
if (cur_p.data[0].p < 2*cur_p.data[1].p) {
|
||||
LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p);
|
||||
break;
|
||||
}
|
||||
|
||||
// evaluate the drafted tokens on the draft model
|
||||
llama_decode(ctx_dft, batch_dft);
|
||||
// drafted token
|
||||
const llama_token id = cur_p.data[0].id;
|
||||
|
||||
drafted.push_back(id);
|
||||
++n_drafted;
|
||||
|
||||
// no need to evaluate the last drafted token, since we won't use the result
|
||||
if (i == n_draft - 1) {
|
||||
break;
|
||||
}
|
||||
|
||||
// evaluate the drafted token on the draft model
|
||||
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, n_ctx);
|
||||
llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0), params.n_threads);
|
||||
++n_past_cur;
|
||||
++n_drafted;
|
||||
|
||||
if (batch_tgt.n_tokens > n_draft) {
|
||||
break;
|
||||
if (grammar_dft != NULL) {
|
||||
llama_grammar_accept_token(ctx_dft, grammar_dft, id);
|
||||
}
|
||||
}
|
||||
|
||||
// account for the last drafted token that we didn't evaluate
|
||||
if (batch_tgt.n_tokens > n_draft) {
|
||||
++n_drafted;
|
||||
}
|
||||
|
||||
// evaluate the target model on the drafted tokens
|
||||
{
|
||||
llama_kv_cache_seq_keep(ctx_tgt, 0);
|
||||
for (int s = 1; s < n_seq_dft; ++s) {
|
||||
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
|
||||
}
|
||||
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, n_ctx);
|
||||
llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0), params.n_threads);
|
||||
++n_past_tgt;
|
||||
|
||||
//LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
|
||||
llama_decode(ctx_tgt, batch_tgt);
|
||||
++n_past_tgt;
|
||||
}
|
||||
|
||||
// the first token is always proposed by the traget model before the speculation loop so we erase it here
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
if (!drafts[s].active) {
|
||||
continue;
|
||||
}
|
||||
|
||||
drafts[s].tokens.erase(drafts[s].tokens.begin());
|
||||
}
|
||||
// the first token is always proposed by the traget model before the speculation loop
|
||||
drafted.erase(drafted.begin());
|
||||
}
|
||||
|
||||
auto t_dec_end = ggml_time_us();
|
||||
@@ -397,8 +280,9 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||
|
||||
// TODO: make sure these numbers are computed correctly
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("n_draft = %d\n", n_draft);
|
||||
LOG_TEE("n_predict = %d\n", n_predict);
|
||||
@@ -412,19 +296,16 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("\ntarget:\n");
|
||||
llama_print_timings(ctx_tgt);
|
||||
|
||||
llama_sampling_free(ctx_sampling);
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
llama_sampling_free(drafts[s].ctx_sampling);
|
||||
}
|
||||
|
||||
llama_batch_free(batch_dft);
|
||||
|
||||
llama_free(ctx_tgt);
|
||||
llama_free_model(model_tgt);
|
||||
|
||||
llama_free(ctx_dft);
|
||||
llama_free_model(model_dft);
|
||||
|
||||
if (grammar_dft != NULL) {
|
||||
llama_grammar_free(grammar_dft);
|
||||
llama_grammar_free(grammar_tgt);
|
||||
}
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
@@ -10,9 +10,9 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
||||
./bin/train-text-from-scratch \
|
||||
--vocab-model ../models/ggml-vocab-llama.gguf \
|
||||
--ctx 64 --embd 256 --head 8 --layer 16 \
|
||||
--checkpoint-in chk-shakespeare-256x16-LATEST.gguf \
|
||||
--checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \
|
||||
--model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \
|
||||
--checkpoint-in chk-shakespeare-256x16.gguf \
|
||||
--checkpoint-out chk-shakespeare-256x16.gguf \
|
||||
--model-out ggml-shakespeare-256x16-f32.gguf \
|
||||
--train-data "shakespeare.txt" \
|
||||
-t 6 -b 16 --seed 1 --adam-iter 256 \
|
||||
--no-checkpointing
|
||||
@@ -20,8 +20,3 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
||||
# predict
|
||||
./bin/main -m ggml-shakespeare-256x16-f32.gguf
|
||||
```
|
||||
|
||||
Output files will be saved every N iterations (config with `--save-every N`).
|
||||
The pattern "ITERATION" in the output filenames will be replaced with the iteration number and "LATEST" for the latest output.
|
||||
|
||||
To train GGUF models just pass them to `--checkpoint-in FN`.
|
||||
|
||||
@@ -47,13 +47,10 @@ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
||||
|
||||
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
|
||||
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
|
||||
LLM_KV_TRAINING_TYPE = "training.type"
|
||||
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
||||
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
||||
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
||||
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
||||
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
||||
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
||||
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
||||
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
||||
|
||||
class Tensor:
|
||||
def __init__(self, dtype='f', ne=None):
|
||||
@@ -364,7 +361,7 @@ class ModelParams:
|
||||
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
||||
|
||||
def tensor_name(key, bid=None):
|
||||
return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"
|
||||
return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + ".weight"
|
||||
|
||||
class Layer:
|
||||
def __init__(self, params, bid):
|
||||
@@ -463,7 +460,6 @@ class Checkpoint:
|
||||
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
||||
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
||||
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -62,7 +62,7 @@
|
||||
mkdir -p $out/include
|
||||
cp ${src}/llama.h $out/include/
|
||||
'';
|
||||
cmakeFlags = [ "-DLLAMA_NATIVE=OFF" "-DLLAMA_BUILD_SERVER=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
|
||||
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
|
||||
in
|
||||
{
|
||||
packages.default = pkgs.stdenv.mkDerivation {
|
||||
|
||||
179
ggml-alloc.c
179
ggml-alloc.c
@@ -1,5 +1,4 @@
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml.h"
|
||||
#include <assert.h>
|
||||
#include <stdarg.h>
|
||||
@@ -7,6 +6,25 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __has_include
|
||||
#if __has_include(<unistd.h>)
|
||||
#include <unistd.h>
|
||||
#if defined(_POSIX_MAPPED_FILES)
|
||||
#include <sys/types.h>
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <memoryapi.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
@@ -59,12 +77,11 @@ struct free_block {
|
||||
size_t size;
|
||||
};
|
||||
|
||||
#define MAX_FREE_BLOCKS 256
|
||||
#define MAX_FREE_BLOCKS 128
|
||||
|
||||
struct ggml_allocr {
|
||||
struct ggml_backend_buffer * buffer;
|
||||
bool buffer_owned;
|
||||
void * data;
|
||||
size_t size;
|
||||
size_t alignment;
|
||||
int n_free_blocks;
|
||||
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
||||
@@ -102,9 +119,16 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
|
||||
}
|
||||
#endif
|
||||
|
||||
static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
return ggml_nbytes(tensor);
|
||||
|
||||
UNUSED(alloc);
|
||||
}
|
||||
|
||||
// check if a tensor is allocated by this buffer
|
||||
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
|
||||
return tensor->buffer == alloc->buffer;
|
||||
void * ptr = tensor->data;
|
||||
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
|
||||
}
|
||||
|
||||
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
@@ -112,10 +136,11 @@ static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
}
|
||||
|
||||
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
|
||||
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
|
||||
|
||||
size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
|
||||
#endif
|
||||
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
|
||||
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
||||
@@ -162,9 +187,6 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
||||
}
|
||||
|
||||
tensor->data = addr;
|
||||
AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
|
||||
tensor->buffer = alloc->buffer;
|
||||
ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
add_allocated_tensor(alloc, tensor);
|
||||
@@ -185,21 +207,18 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
||||
|
||||
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
||||
static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
void * ptr = tensor->data;
|
||||
|
||||
if (ggml_allocr_is_own(alloc, tensor) == false) {
|
||||
// the tensor was not allocated in this buffer
|
||||
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
|
||||
// the easiest way to deal with this is just to ignore it
|
||||
AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
void * ptr = tensor->data;
|
||||
|
||||
size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
|
||||
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
|
||||
|
||||
ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
|
||||
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
remove_allocated_tensor(alloc, tensor);
|
||||
@@ -264,18 +283,15 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
|
||||
alloc->n_free_blocks = 1;
|
||||
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
|
||||
alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
|
||||
alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
|
||||
alloc->free_blocks[0].size = alloc->size - align_offset;
|
||||
}
|
||||
|
||||
struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
|
||||
struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
|
||||
|
||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
|
||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
|
||||
|
||||
*alloc = (struct ggml_allocr){
|
||||
/*.buffer = */ buffer,
|
||||
/*.buffer_owned = */ true,
|
||||
/*.base = */ ggml_backend_buffer_get_base(buffer),
|
||||
/*.data = */ data,
|
||||
/*.size = */ size,
|
||||
/*.alignment = */ alignment,
|
||||
/*.n_free_blocks = */ 0,
|
||||
/*.free_blocks = */ {{0}},
|
||||
@@ -294,26 +310,74 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
|
||||
return alloc;
|
||||
}
|
||||
|
||||
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||
struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
|
||||
alloc->measure = true;
|
||||
|
||||
return alloc;
|
||||
// OS specific functions to allocate and free uncommitted virtual memory
|
||||
static void * alloc_vmem(size_t size) {
|
||||
#if defined(_WIN32)
|
||||
return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
|
||||
#elif defined(_POSIX_MAPPED_FILES)
|
||||
void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
|
||||
if (ptr == MAP_FAILED) {
|
||||
return NULL;
|
||||
}
|
||||
return ptr;
|
||||
#else
|
||||
// use a fixed address for other platforms
|
||||
uintptr_t base_addr = (uintptr_t)-size - 0x100;
|
||||
return (void *)base_addr;
|
||||
#endif
|
||||
}
|
||||
|
||||
struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
|
||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
|
||||
static void free_vmem(void * base_addr, size_t size) {
|
||||
#if defined(_WIN32)
|
||||
VirtualFree(base_addr, 0, MEM_RELEASE);
|
||||
UNUSED(size);
|
||||
#elif defined(_POSIX_MAPPED_FILES)
|
||||
munmap(base_addr, size);
|
||||
#else
|
||||
// nothing to do
|
||||
UNUSED(base_addr);
|
||||
UNUSED(size);
|
||||
#endif
|
||||
}
|
||||
|
||||
// allocate uncommitted virtual memory to measure the size of the graph
|
||||
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
|
||||
// 128GB for 64-bit, 1GB for 32-bit
|
||||
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
|
||||
do {
|
||||
*base_addr = alloc_vmem(*size);
|
||||
if (*base_addr != NULL) {
|
||||
AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
|
||||
return;
|
||||
}
|
||||
// try again with half the size
|
||||
*size /= 2;
|
||||
} while (*size > 0);
|
||||
|
||||
GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
|
||||
}
|
||||
|
||||
static void free_measure_vmem(void * base_addr, size_t size) {
|
||||
free_vmem(base_addr, size);
|
||||
}
|
||||
|
||||
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
|
||||
|
||||
void * base_addr;
|
||||
size_t size;
|
||||
|
||||
alloc_measure_vmem(&base_addr, &size);
|
||||
|
||||
*alloc = (struct ggml_allocr){
|
||||
/*.buffer = */ buffer,
|
||||
/*.buffer_owned = */ false,
|
||||
/*.base = */ ggml_backend_buffer_get_base(buffer),
|
||||
/*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
|
||||
/*.data = */ base_addr,
|
||||
/*.size = */ size,
|
||||
/*.alignment = */ alignment,
|
||||
/*.n_free_blocks = */ 0,
|
||||
/*.free_blocks = */ {{0}},
|
||||
/*.hash_table = */ {{0}},
|
||||
/*.max_size = */ 0,
|
||||
/*.measure = */ false,
|
||||
/*.measure = */ true,
|
||||
/*.parse_seq = */ {0},
|
||||
/*.parse_seq_len = */ 0,
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
@@ -327,8 +391,8 @@ struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * bu
|
||||
}
|
||||
|
||||
void ggml_allocr_free(struct ggml_allocr * alloc) {
|
||||
if (alloc->buffer_owned) {
|
||||
ggml_backend_buffer_free(alloc->buffer);
|
||||
if (alloc->measure) {
|
||||
free_measure_vmem(alloc->data, alloc->size);
|
||||
}
|
||||
free(alloc);
|
||||
}
|
||||
@@ -371,6 +435,7 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_CONT:
|
||||
return true;
|
||||
|
||||
default:
|
||||
@@ -378,23 +443,12 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
}
|
||||
}
|
||||
|
||||
static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
|
||||
assert(view->view_src != NULL && view->view_src->data != NULL);
|
||||
view->backend = view->view_src->backend;
|
||||
view->buffer = view->view_src->buffer;
|
||||
view->data = (char *)view->view_src->data + view->view_offs;
|
||||
|
||||
// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
|
||||
// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
|
||||
assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
|
||||
ggml_backend_buffer_init_tensor(alloc->buffer, view);
|
||||
}
|
||||
|
||||
static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
|
||||
struct hash_node * ht = alloc->hash_table;
|
||||
if (node->data == NULL) {
|
||||
if (ggml_is_view(node)) {
|
||||
init_view(alloc, node);
|
||||
assert(node->view_src->data != NULL);
|
||||
node->data = (char *)node->view_src->data + node->view_offs;
|
||||
} else {
|
||||
// see if we can reuse a parent's buffer (inplace)
|
||||
if (ggml_op_can_inplace(node->op)) {
|
||||
@@ -422,17 +476,13 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
||||
// adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
|
||||
// for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
|
||||
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
|
||||
node->view_src = view_src;
|
||||
view_src_hn->n_views += 1;
|
||||
init_view(alloc, node);
|
||||
node->data = parent->data;
|
||||
return;
|
||||
}
|
||||
}
|
||||
else {
|
||||
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
||||
node->view_src = parent;
|
||||
p_hn->n_views += 1;
|
||||
init_view(alloc, node);
|
||||
node->data = parent->data;
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -443,7 +493,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
||||
}
|
||||
}
|
||||
|
||||
size_t ggml_allocr_alloc_graph_n(
|
||||
static size_t ggml_allocr_alloc_graph_tensors_n(
|
||||
struct ggml_allocr * alloc,
|
||||
struct ggml_cgraph ** graphs, int n_graphs,
|
||||
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
|
||||
@@ -461,10 +511,6 @@ size_t ggml_allocr_alloc_graph_n(
|
||||
if (ggml_is_view(node)) {
|
||||
struct ggml_tensor * view_src = node->view_src;
|
||||
hash_get(ht, view_src)->n_views += 1;
|
||||
if (node->buffer == NULL && node->data != NULL) {
|
||||
// view of a pre-allocated tensor, didn't call init_view() yet
|
||||
init_view(alloc, node);
|
||||
}
|
||||
}
|
||||
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
@@ -473,9 +519,6 @@ size_t ggml_allocr_alloc_graph_n(
|
||||
break;
|
||||
}
|
||||
hash_get(ht, parent)->n_children += 1;
|
||||
if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
|
||||
init_view(alloc, parent);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -586,9 +629,5 @@ size_t ggml_allocr_alloc_graph_n(
|
||||
}
|
||||
|
||||
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
|
||||
return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
|
||||
}
|
||||
|
||||
size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
|
||||
return alloc->max_size;
|
||||
return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
||||
}
|
||||
|
||||
15
ggml-alloc.h
15
ggml-alloc.h
@@ -6,27 +6,20 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct ggml_backend_buffer;
|
||||
|
||||
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
|
||||
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
|
||||
GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
|
||||
|
||||
// tell the allocator to parse nodes following the order described in the list
|
||||
// you should call this if your graph are optimized to execute out-of-order
|
||||
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
|
||||
|
||||
GGML_API void ggml_allocr_free (struct ggml_allocr * alloc);
|
||||
GGML_API bool ggml_allocr_is_measure (struct ggml_allocr * alloc);
|
||||
GGML_API void ggml_allocr_reset (struct ggml_allocr * alloc);
|
||||
GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
|
||||
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
|
||||
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
|
||||
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
|
||||
GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc);
|
||||
|
||||
GGML_API size_t ggml_allocr_alloc_graph_n(
|
||||
struct ggml_allocr * alloc,
|
||||
struct ggml_cgraph ** graphs, int n_graphs,
|
||||
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
385
ggml-backend.c
385
ggml-backend.c
@@ -1,385 +0,0 @@
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-alloc.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// backend buffer
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
struct ggml_backend * backend,
|
||||
struct ggml_backend_buffer_i iface,
|
||||
ggml_backend_buffer_context_t context,
|
||||
size_t size) {
|
||||
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
|
||||
|
||||
GGML_ASSERT(iface.get_base != NULL);
|
||||
|
||||
(*buffer) = (struct ggml_backend_buffer) {
|
||||
/* .interface = */ iface,
|
||||
/* .backend = */ backend,
|
||||
/* .context = */ context,
|
||||
/* .size = */ size,
|
||||
};
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
if (buffer->iface.free_buffer != NULL) {
|
||||
buffer->iface.free_buffer(buffer);
|
||||
}
|
||||
free(buffer);
|
||||
}
|
||||
|
||||
size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
|
||||
return ggml_backend_get_alignment(buffer->backend);
|
||||
}
|
||||
|
||||
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_base(buffer);
|
||||
}
|
||||
|
||||
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
|
||||
return buffer->size;
|
||||
}
|
||||
|
||||
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
if (buffer->iface.get_alloc_size) {
|
||||
return buffer->iface.get_alloc_size(buffer, tensor);
|
||||
}
|
||||
return ggml_nbytes(tensor);
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
if (buffer->iface.init_tensor) {
|
||||
buffer->iface.init_tensor(buffer, tensor);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
if (buffer->iface.free_tensor) {
|
||||
buffer->iface.free_tensor(buffer, tensor);
|
||||
}
|
||||
}
|
||||
|
||||
// backend
|
||||
|
||||
ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
|
||||
return tensor->buffer->backend;
|
||||
}
|
||||
|
||||
const char * ggml_backend_name(ggml_backend_t backend) {
|
||||
return backend->iface.get_name(backend);
|
||||
}
|
||||
|
||||
void ggml_backend_free(ggml_backend_t backend) {
|
||||
backend->iface.free(backend);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
|
||||
return backend->iface.alloc_buffer(backend, size);
|
||||
}
|
||||
|
||||
size_t ggml_backend_get_alignment(ggml_backend_t backend) {
|
||||
return backend->iface.get_alignment(backend);
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
|
||||
ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
|
||||
ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
|
||||
}
|
||||
|
||||
void ggml_backend_synchronize(ggml_backend_t backend) {
|
||||
backend->iface.synchronize(backend);
|
||||
}
|
||||
|
||||
ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
return backend->iface.graph_plan_create(backend, cgraph);
|
||||
}
|
||||
|
||||
void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
backend->iface.graph_plan_free(backend, plan);
|
||||
}
|
||||
|
||||
void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
backend->iface.graph_plan_compute(backend, plan);
|
||||
}
|
||||
|
||||
void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
backend->iface.graph_compute(backend, cgraph);
|
||||
}
|
||||
|
||||
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
return backend->iface.supports_op(backend, op);
|
||||
}
|
||||
|
||||
// backend copy
|
||||
|
||||
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||||
if (a->type != b->type) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (a->ne[i] != b->ne[i]) {
|
||||
return false;
|
||||
}
|
||||
if (a->nb[i] != b->nb[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
//printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
|
||||
//printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
|
||||
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
|
||||
|
||||
// printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
|
||||
|
||||
if (src == dst) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO: allow backends to support copy to/from same backend
|
||||
|
||||
if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
|
||||
ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
|
||||
} else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
|
||||
ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
|
||||
} else {
|
||||
// shouldn't be hit when copying from/to CPU
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend));
|
||||
#endif
|
||||
size_t nbytes = ggml_nbytes(src);
|
||||
void * data = malloc(nbytes);
|
||||
ggml_backend_tensor_get(src, data, 0, nbytes);
|
||||
ggml_backend_tensor_set(dst, data, 0, nbytes);
|
||||
free(data);
|
||||
}
|
||||
}
|
||||
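In short, ggml_backend_tensor_copy() prefers a backend-provided single-copy path and only falls back to staging through host memory. A minimal, hypothetical call site (t_cpu and t_gpu are assumed to be tensors allocated on two different backends):

// layouts must match (same type, ne and nb), otherwise the assert above fires
ggml_backend_tensor_copy(t_cpu, t_gpu); // uses cpy_tensor_from/to if available, falls back to get/set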
|
||||
// backend CPU
|
||||
|
||||
struct ggml_backend_cpu_context {
|
||||
int n_threads;
|
||||
void * work_data;
|
||||
size_t work_size;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
|
||||
return "CPU";
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
free(cpu_ctx->work_data);
|
||||
free(cpu_ctx);
|
||||
free(backend);
|
||||
}
|
||||
|
||||
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return (void *)buffer->context;
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
free(buffer->context);
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
||||
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .init_tensor = */ NULL, // no initialization required
|
||||
/* .free_tensor = */ NULL, // no cleanup required
|
||||
};
|
||||
|
||||
// for buffers from ptr, free is not called
|
||||
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
||||
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
||||
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .init_tensor = */ NULL,
|
||||
/* .free_tensor = */ NULL,
|
||||
};
|
||||
|
||||
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
|
||||
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
||||
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
|
||||
|
||||
return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) {
|
||||
return TENSOR_ALIGNMENT;
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
|
||||
memcpy((char *)tensor->data + offset, data, size);
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
|
||||
memcpy(data, (const char *)tensor->data + offset, size);
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_synchronize(ggml_backend_t backend) {
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
// for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends
|
||||
ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
struct ggml_backend_plan_cpu {
|
||||
struct ggml_cplan cplan;
|
||||
struct ggml_cgraph cgraph;
|
||||
};
|
||||
|
||||
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
||||
|
||||
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
||||
cpu_plan->cgraph = *cgraph;
|
||||
|
||||
if (cpu_plan->cplan.work_size > 0) {
|
||||
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
|
||||
}
|
||||
|
||||
return cpu_plan;
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||
|
||||
free(cpu_plan->cplan.work_data);
|
||||
free(cpu_plan);
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||
|
||||
ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
||||
|
||||
if (cpu_ctx->work_size < cplan.work_size) {
|
||||
// TODO: may be faster to free and use malloc to avoid the copy
|
||||
cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
|
||||
cpu_ctx->work_size = cplan.work_size;
|
||||
}
|
||||
|
||||
cplan.work_data = cpu_ctx->work_data;
|
||||
|
||||
ggml_graph_compute(cgraph, &cplan);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
return true;
|
||||
UNUSED(backend);
|
||||
UNUSED(op);
|
||||
}
|
||||
|
||||
static struct ggml_backend_i cpu_backend_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_name,
|
||||
/* .free = */ ggml_backend_cpu_free,
|
||||
/* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_get_alignment,
|
||||
/* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async,
|
||||
/* .synchronize = */ ggml_backend_cpu_synchronize,
|
||||
/* .cpy_tensor_from = */ ggml_backend_cpu_cpy_tensor_from,
|
||||
/* .cpy_tensor_to = */ ggml_backend_cpu_cpy_tensor_to,
|
||||
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
||||
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
||||
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
||||
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_cpu_supports_op,
|
||||
};
|
||||
|
||||
ggml_backend_t ggml_backend_cpu_init(void) {
|
||||
struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
|
||||
|
||||
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
||||
ctx->work_data = NULL;
|
||||
ctx->work_size = 0;
|
||||
|
||||
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
|
||||
|
||||
*cpu_backend = (struct ggml_backend) {
|
||||
/* .interface = */ cpu_backend_i,
|
||||
/* .context = */ ctx
|
||||
};
|
||||
return cpu_backend;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_cpu(ggml_backend_t backend) {
    return backend->iface.get_name == ggml_backend_cpu_name;
}

void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
    ctx->n_threads = n_threads;
}

ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
    return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
}
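A brief usage sketch of the CPU backend implemented above (not part of the diff); ptr and size stand for an existing host allocation and are assumptions of the example:

ggml_backend_t backend = ggml_backend_cpu_init();
ggml_backend_cpu_set_n_threads(backend, 8);

// wrap an existing host allocation; the buffer does not take ownership of ptr
ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr(backend, ptr, size);

// ... allocate tensors, build and compute graphs ...

ggml_backend_buffer_free(buf);
ggml_backend_free(backend);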
143 ggml-backend.h
@@ -1,143 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
struct ggml_backend;
|
||||
struct ggml_backend_buffer;
|
||||
|
||||
// type-erased backend-specific types / wrappers
|
||||
typedef void * ggml_backend_context_t;
|
||||
typedef void * ggml_backend_graph_plan_t;
|
||||
typedef void * ggml_backend_buffer_context_t;
|
||||
|
||||
// avoid accessing internals of these types
|
||||
typedef struct ggml_backend * ggml_backend_t;
|
||||
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
||||
|
||||
//
|
||||
// backend buffer
|
||||
//
|
||||
|
||||
struct ggml_backend_buffer_i {
|
||||
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
||||
void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer
|
||||
size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
|
||||
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
|
||||
void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
|
||||
};
|
||||
|
||||
// TODO: hide behind API
|
||||
struct ggml_backend_buffer {
|
||||
struct ggml_backend_buffer_i iface;
|
||||
|
||||
ggml_backend_t backend;
|
||||
ggml_backend_buffer_context_t context;
|
||||
|
||||
size_t size;
|
||||
};
|
||||
|
||||
// backend buffer functions
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
struct ggml_backend * backend,
|
||||
struct ggml_backend_buffer_i iface,
|
||||
ggml_backend_buffer_context_t context,
|
||||
size_t size);
|
||||
|
||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_free_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
// backend
|
||||
//
|
||||
|
||||
struct ggml_backend_i {
|
||||
const char * (*get_name)(ggml_backend_t backend);
|
||||
|
||||
void (*free)(ggml_backend_t backend);
|
||||
|
||||
// buffer allocation
|
||||
ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
|
||||
|
||||
// get buffer alignment
|
||||
size_t (*get_alignment)(ggml_backend_t backend);
|
||||
|
||||
// tensor data access
|
||||
// these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
|
||||
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
void (*synchronize) (ggml_backend_t backend);
|
||||
|
||||
// (optional) copy tensor between different backends, allowing for single-copy transfers
|
||||
void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
// compute graph with a plan
|
||||
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
|
||||
// compute graph without a plan
|
||||
void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
|
||||
// check if the backend supports an operation
|
||||
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
};
|
||||
|
||||
// TODO: hide behind API
|
||||
struct ggml_backend {
|
||||
struct ggml_backend_i iface;
|
||||
|
||||
ggml_backend_context_t context;
|
||||
};
|
||||
|
||||
// backend helper functions
|
||||
GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_free(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
|
||||
|
||||
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
|
||||
|
||||
GGML_API void ggml_backend_tensor_set_async( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
|
||||
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
|
||||
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
|
||||
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
|
||||
// tensor copy between different backends
|
||||
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
//
// CPU backend
//

GGML_API ggml_backend_t ggml_backend_cpu_init(void);

GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);

GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);

GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);

#ifdef __cplusplus
}
#endif
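To illustrate how the declarations in this header fit together, here is a hedged end-to-end sketch; build_graph(), ctx, input, output, data, result and buf_size are placeholders the example assumes exist elsewhere:

ggml_backend_t backend = ggml_backend_cpu_init();                   // or any other backend

ggml_backend_buffer_t buf = ggml_backend_alloc_buffer(backend, buf_size);
struct ggml_cgraph * gf = build_graph(ctx);                         // hypothetical helper

ggml_backend_tensor_set(input, data, 0, ggml_nbytes(input));        // synchronous upload
ggml_backend_graph_compute(backend, gf);
ggml_backend_tensor_get(output, result, 0, ggml_nbytes(output));    // synchronous download

ggml_backend_buffer_free(buf);
ggml_backend_free(backend);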
796 ggml-cuda.cu
File diff suppressed because it is too large
@@ -1,7 +1,6 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
@@ -43,9 +42,6 @@ GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, s
GGML_API int  ggml_cuda_get_device_count(void);
GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);

// backend API
GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use

#ifdef __cplusplus
}
#endif
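Where the CUDA backend API is available, runtime selection might look like the sketch below; the GGML_USE_CUBLAS guard is an assumption of the example and is not part of this header:

ggml_backend_t backend = NULL;
#ifdef GGML_USE_CUBLAS                   // assumed build-time flag
backend = ggml_backend_cuda_init();
#endif
if (backend == NULL) {
    backend = ggml_backend_cpu_init();   // fall back to the CPU backend
}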
19 ggml-metal.h
@@ -20,7 +20,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
@@ -36,15 +35,10 @@ struct ggml_cgraph;
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
// internal API
|
||||
// temporary exposed to user-code
|
||||
//
|
||||
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
struct ggml_metal_context;
|
||||
|
||||
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
// number of command buffers to use
|
||||
struct ggml_metal_context * ggml_metal_init(int n_cb);
|
||||
void ggml_metal_free(struct ggml_metal_context * ctx);
|
||||
@@ -89,17 +83,6 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
// creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

//
// backend API
// user-code should use only these functions
//

GGML_API ggml_backend_t ggml_backend_metal_init(void);

GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);

#ifdef __cplusplus
}
#endif
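User code is expected to go through the backend API above rather than the internal ggml_metal_* calls; a minimal sketch, using only the functions declared in this header:

ggml_backend_t backend = ggml_backend_metal_init();
if (backend != NULL && ggml_backend_is_metal(backend)) {
    ggml_backend_metal_set_n_cb(backend, 4); // number of command buffers used per graph
}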
491 ggml-metal.m
@@ -81,18 +81,18 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DECL_KERNEL(rms_norm);
|
||||
GGML_METAL_DECL_KERNEL(norm);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
|
||||
@@ -109,8 +109,6 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_DECL_KERNEL(cpy_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(cpy_f16_f16);
|
||||
GGML_METAL_DECL_KERNEL(concat);
|
||||
GGML_METAL_DECL_KERNEL(sqr);
|
||||
|
||||
#undef GGML_METAL_DECL_KERNEL
|
||||
};
|
||||
@@ -185,44 +183,56 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
|
||||
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
// load library
|
||||
#ifdef GGML_SWIFT
|
||||
// load the default.metallib file
|
||||
{
|
||||
NSBundle * bundle = nil;
|
||||
#ifdef SWIFT_PACKAGE
|
||||
bundle = SWIFTPM_MODULE_BUNDLE;
|
||||
#else
|
||||
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
#endif
|
||||
NSError * error = nil;
|
||||
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
|
||||
if (libPath != nil) {
|
||||
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
||||
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
|
||||
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
||||
} else {
|
||||
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
||||
|
||||
NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
|
||||
NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
|
||||
NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
|
||||
NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
|
||||
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
||||
|
||||
MTLCompileOptions* options = nil;
|
||||
#ifdef GGML_QKK_64
|
||||
options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = @{ @"QK_K" : @(64) };
|
||||
#endif
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
||||
}
|
||||
// Load the metallib file into a Metal library
|
||||
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
||||
|
||||
if (error) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#else
|
||||
UNUSED(msl_library_source);
|
||||
|
||||
// read the source from "ggml-metal.metal" into a string and use newLibraryWithSource
|
||||
{
|
||||
NSError * error = nil;
|
||||
|
||||
//NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]);
|
||||
|
||||
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifdef GGML_QKK_64
|
||||
MTLCompileOptions* options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = @{ @"QK_K" : @(64) };
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
||||
#else
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
|
||||
#endif
|
||||
if (error) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// load kernels
|
||||
{
|
||||
@@ -262,57 +272,40 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_ADD_KERNEL(rms_norm);
|
||||
GGML_METAL_ADD_KERNEL(norm);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
|
||||
}
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(rope_f32);
|
||||
GGML_METAL_ADD_KERNEL(rope_f16);
|
||||
GGML_METAL_ADD_KERNEL(alibi_f32);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f16_f16);
|
||||
GGML_METAL_ADD_KERNEL(concat);
|
||||
GGML_METAL_ADD_KERNEL(sqr);
|
||||
|
||||
#undef GGML_METAL_ADD_KERNEL
|
||||
}
|
||||
|
||||
#if TARGET_OS_OSX
|
||||
// print MTL GPU family:
|
||||
GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
|
||||
|
||||
// determine max supported GPU family
|
||||
// https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
|
||||
// https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
||||
for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
|
||||
if ([ctx->device supportsFamily:i]) {
|
||||
GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
#if TARGET_OS_OSX
|
||||
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
if (ctx->device.maxTransferRate != 0) {
|
||||
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||
@@ -354,38 +347,34 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DEL_KERNEL(rms_norm);
|
||||
GGML_METAL_DEL_KERNEL(norm);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
|
||||
}
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(rope_f32);
|
||||
GGML_METAL_DEL_KERNEL(rope_f16);
|
||||
GGML_METAL_DEL_KERNEL(alibi_f32);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f16_f16);
|
||||
GGML_METAL_DEL_KERNEL(concat);
|
||||
GGML_METAL_DEL_KERNEL(sqr);
|
||||
|
||||
#undef GGML_METAL_DEL_KERNEL
|
||||
|
||||
@@ -442,7 +431,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
|
||||
for (int i = 0; i < ctx->n_buffers; ++i) {
|
||||
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
|
||||
|
||||
//GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
|
||||
//metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
|
||||
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
|
||||
*offs = (size_t) ioffs;
|
||||
|
||||
@@ -777,44 +766,6 @@ void ggml_metal_graph_compute(
|
||||
{
|
||||
// noop
|
||||
} break;
|
||||
case GGML_OP_CONCAT:
|
||||
{
|
||||
const int64_t nb = ne00;
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_concat];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
|
||||
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
|
||||
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
|
||||
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
|
||||
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
|
||||
[encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21];
|
||||
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26];
|
||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:27];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_ADD:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
@@ -910,10 +861,9 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(gf->nodes[i])) {
|
||||
@@ -923,10 +873,9 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_RELU:
|
||||
{
|
||||
@@ -944,10 +893,9 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -955,17 +903,6 @@ void ggml_metal_graph_compute(
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_SQR:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_sqr];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
{
|
||||
const int nth = MIN(32, ne00);
|
||||
@@ -1007,46 +944,21 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
// TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
|
||||
|
||||
GGML_ASSERT(ne00 == ne10);
|
||||
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
|
||||
uint gqa = ne12/ne02;
|
||||
GGML_ASSERT(ne03 == ne13);
|
||||
|
||||
const uint gqa = ne12/ne02;
|
||||
|
||||
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
||||
// to the matrix-vector kernel
|
||||
int ne11_mm_min = 1;
|
||||
|
||||
#if 0
|
||||
// the numbers below are measured on M2 Ultra for 7B and 13B models
|
||||
// these numbers do not translate to other devices or model sizes
|
||||
// TODO: need to find a better approach
|
||||
if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
||||
case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
||||
case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
||||
case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
||||
case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
||||
case GGML_TYPE_Q5_0: // not tested yet
|
||||
case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
||||
case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
||||
case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
||||
default: ne11_mm_min = 1; break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
|
||||
!ggml_is_transposed(src0) &&
|
||||
if (!ggml_is_transposed(src0) &&
|
||||
!ggml_is_transposed(src1) &&
|
||||
src1t == GGML_TYPE_F32 &&
|
||||
ne00 % 32 == 0 && ne00 >= 64 &&
|
||||
ne11 > ne11_mm_min) {
|
||||
//printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
|
||||
[ctx->device supportsFamily:MTLGPUFamilyApple7] &&
|
||||
ne00%32 == 0 &&
|
||||
ne11 > 2) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
|
||||
@@ -1075,18 +987,17 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12];
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:13];
|
||||
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
} else {
|
||||
int nth0 = 32;
|
||||
int nth1 = 1;
|
||||
int nrows = 1;
|
||||
//printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
|
||||
|
||||
// use custom matrix x vector kernel
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
|
||||
nrows = 4;
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
@@ -1094,12 +1005,12 @@ void ggml_metal_graph_compute(
|
||||
nth0 = 32;
|
||||
nth1 = 1;
|
||||
if (ne11 * ne12 < 4) {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
|
||||
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
|
||||
nrows = ne11;
|
||||
} else {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
||||
nrows = 4;
|
||||
}
|
||||
} break;
|
||||
@@ -1110,7 +1021,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
{
|
||||
@@ -1119,7 +1030,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
{
|
||||
@@ -1128,7 +1039,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q8_0_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
{
|
||||
@@ -1137,7 +1048,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
{
|
||||
@@ -1146,7 +1057,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
{
|
||||
@@ -1155,7 +1066,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 4; //1;
|
||||
nth1 = 8; //32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
{
|
||||
@@ -1164,7 +1075,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
{
|
||||
@@ -1173,7 +1084,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -1202,7 +1113,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
|
||||
src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
|
||||
src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q4_K) {
|
||||
@@ -1255,8 +1166,6 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
case GGML_OP_RMS_NORM:
|
||||
{
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
@@ -1299,14 +1208,17 @@ void ggml_metal_graph_compute(
|
||||
|
||||
const int nth = MIN(1024, ne00);
|
||||
|
||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
|
||||
const int n_head = ((int32_t *) dst->op_params)[1];
|
||||
float max_bias;
|
||||
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
|
||||
|
||||
if (__builtin_popcount(n_head) != 1) {
|
||||
GGML_ASSERT(false && "only power-of-two n_head implemented");
|
||||
}
|
||||
|
||||
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
||||
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_alibi_f32];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
@@ -1327,9 +1239,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
|
||||
[encoder setBytes:&m1 length:sizeof( float) atIndex:19];
|
||||
[encoder setBytes:&n_heads_log2_floor length:sizeof(int) atIndex:20];
|
||||
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
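For reference, m0 and m1 above parameterize the conventional ALiBi slope schedule. A hedged C sketch of the per-head slope follows (only the m0 branch is reachable in this kernel, since it asserts a power-of-two n_head):

#include <math.h>

// slope for attention head h (0-based), assuming the conventional ALiBi schedule
static float alibi_slope(int h, int n_head, float max_bias) {
    const int   n_heads_log2_floor = 1 << (int) floor(log2(n_head));
    const float m0 = powf(2.0f, -max_bias          / n_heads_log2_floor);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
    return h < n_heads_log2_floor ? powf(m0, h + 1)
                                  : powf(m1, 2*(h - n_heads_log2_floor) + 1);
}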
@@ -1462,140 +1372,3 @@ void ggml_metal_graph_compute(
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// backend interface
|
||||
|
||||
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
||||
return "Metal";
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_free(ggml_backend_t backend) {
|
||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||
ggml_metal_free(ctx);
|
||||
free(backend);
|
||||
}
|
||||
|
||||
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return (void *)buffer->context;
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
free(buffer->context);
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static struct ggml_backend_buffer_i metal_backend_buffer_i = {
|
||||
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .init_tensor = */ NULL, // no initialization required
|
||||
/* .free_tensor = */ NULL, // no cleanup required
|
||||
};
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) {
|
||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||
|
||||
void * data = ggml_metal_host_malloc(size);
|
||||
|
||||
// TODO: set proper name of the buffers
|
||||
ggml_metal_add_buffer(ctx, "backend", data, size, 0);
|
||||
|
||||
return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) {
|
||||
return 32;
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
|
||||
memcpy((char *)tensor->data + offset, data, size);
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
||||
|
||||
memcpy(data, (const char *)tensor->data + offset, size);
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
|
||||
ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
|
||||
|
||||
UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
||||
|
||||
ggml_metal_graph_compute(metal_ctx, cgraph);
|
||||
}
|
||||
|
||||
static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
return true;
|
||||
UNUSED(backend);
|
||||
UNUSED(op);
|
||||
}
|
||||
|
||||
static struct ggml_backend_i metal_backend_i = {
|
||||
/* .get_name = */ ggml_backend_metal_name,
|
||||
/* .free = */ ggml_backend_metal_free,
|
||||
/* .alloc_buffer = */ ggml_backend_metal_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_metal_get_alignment,
|
||||
/* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
|
||||
/* .synchronize = */ ggml_backend_metal_synchronize,
|
||||
/* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from,
|
||||
/* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to,
|
||||
/* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm
|
||||
/* .graph_plan_free = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_metal_supports_op,
|
||||
};
|
||||
|
||||
ggml_backend_t ggml_backend_metal_init(void) {
|
||||
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
||||
|
||||
ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
||||
|
||||
ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
|
||||
|
||||
*metal_backend = (struct ggml_backend) {
|
||||
/* .interface = */ metal_backend_i,
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
return metal_backend;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_metal(ggml_backend_t backend) {
|
||||
return backend->iface.get_name == ggml_backend_metal_name;
|
||||
}
|
||||
|
||||
void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
||||
|
||||
ggml_metal_set_n_cb(ctx, n_cb);
|
||||
}
|
||||
|
||||
184 ggml-metal.metal
@@ -13,8 +13,8 @@ typedef struct {
|
||||
|
||||
#define QK4_1 32
|
||||
typedef struct {
|
||||
half d; // delta
|
||||
half m; // min
|
||||
half d; // delta
|
||||
half m; // min
|
||||
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
||||
} block_q4_1;
|
||||
|
||||
@@ -132,13 +132,6 @@ kernel void kernel_relu(
|
||||
dst[tpig] = max(0.0f, src0[tpig]);
|
||||
}
|
||||
|
||||
kernel void kernel_sqr(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] * src0[tpig];
|
||||
}
|
||||
|
||||
constant float GELU_COEF_A = 0.044715f;
|
||||
constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
|
||||
@@ -345,11 +338,10 @@ kernel void kernel_rms_norm(
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
|
||||
device const float * x_scalar = (device const float *) x;
|
||||
|
||||
float4 sumf = 0;
|
||||
float all_sum = 0;
|
||||
device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
|
||||
device const float * x_scalar = (device const float *) x;
|
||||
float4 sumf=0;
|
||||
float all_sum=0;
|
||||
|
||||
// parallel sum
|
||||
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
||||
@@ -362,7 +354,6 @@ kernel void kernel_rms_norm(
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// broadcast, simd group number is ntg / 32
|
||||
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
|
||||
if (tpitg < i) {
|
||||
@@ -370,9 +361,7 @@ kernel void kernel_rms_norm(
|
||||
}
|
||||
}
|
||||
if (tpitg == 0) {
|
||||
for (int i = 4 * (ne00 / 4); i < ne00; i++) {
|
||||
sum[0] += x_scalar[i];
|
||||
}
|
||||
for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
|
||||
sum[0] /= ne00;
|
||||
}
|
||||
|
||||
@@ -387,9 +376,7 @@ kernel void kernel_rms_norm(
|
||||
y[i00] = x[i00] * scale;
|
||||
}
|
||||
if (tpitg == 0) {
|
||||
for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
|
||||
y_scalar[i00] = x_scalar[i00] * scale;
|
||||
}
|
||||
for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -429,8 +416,8 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre
|
||||
}
|
||||
|
||||
// putting them in the kernel causes a significant performance penalty
|
||||
#define N_DST 4 // each SIMD group works on 4 rows
|
||||
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
||||
#define N_DST 4 // each SIMD group works on 4 rows
|
||||
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
||||
//Note: This is a template, but strictly speaking it only applies to
|
||||
// quantizations where the block size is 32. It also does not
|
||||
@@ -441,23 +428,18 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
|
||||
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa,
|
||||
uint3 tgpig, uint tiisg, uint sgitg) {
|
||||
const int nb = ne00/QK4_0;
|
||||
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
|
||||
const int first_row = (r0 * nsg + sgitg) * nr;
|
||||
|
||||
const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
|
||||
|
||||
device const block_q_type * x = (device const block_q_type *) src0 + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
float yl[16]; // src1 vector cache
|
||||
float sumf[nr]={0.f};
|
||||
|
||||
float yl[16]; // src1 vector cache
|
||||
float sumf[nr] = {0.f};
|
||||
|
||||
const int ix = (tiisg/2);
|
||||
const int il = (tiisg%2)*8;
|
||||
const int ix = tiisg/2;
|
||||
const int il = 8*(tiisg%2);
|
||||
|
||||
device const float * yb = y + ix * QK4_0 + il;
|
||||
|
||||
@@ -468,7 +450,6 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
|
||||
sumy += yb[i] + yb[i+1];
|
||||
yl[i+0] = yb[i+ 0];
|
||||
yl[i+1] = yb[i+ 1]/256.f;
|
||||
|
||||
sumy += yb[i+16] + yb[i+17];
|
||||
yl[i+8] = yb[i+16]/16.f;
|
||||
yl[i+9] = yb[i+17]/4096.f;
|
||||
@@ -484,12 +465,12 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
|
||||
for (int row = 0; row < nr; ++row) {
|
||||
const float tot = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && first_row + row < ne01) {
|
||||
dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
|
||||
}
|
||||
}
|
||||
}
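To make the indexing above easier to follow, a small sketch of the row mapping used by mul_vec_q_n_f32, assuming nr == N_DST and nsg == N_SIMDGROUP as defined earlier; the threadgroup count is illustrative.

// Each threadgroup covers N_SIMDGROUP * N_DST rows; each simdgroup owns N_DST of them.
#include <stdio.h>

#define N_DST       4
#define N_SIMDGROUP 2

int main(void) {
    for (int r0 = 0; r0 < 2; r0++) {                        // threadgroup index along x
        for (int sgitg = 0; sgitg < N_SIMDGROUP; sgitg++) { // simdgroup within the threadgroup
            int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
            printf("tg %d, simdgroup %d -> rows [%d, %d)\n",
                   r0, sgitg, first_row, first_row + N_DST);
        }
    }
    return 0;
}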
|
||||
|
||||
kernel void kernel_mul_mv_q4_0_f32(
|
||||
kernel void kernel_mul_mat_q4_0_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -502,12 +483,12 @@ kernel void kernel_mul_mv_q4_0_f32(
|
||||
constant int64_t & ne1[[buffer(16)]],
|
||||
constant uint & gqa[[buffer(17)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mv_q4_1_f32(
|
||||
kernel void kernel_mul_mat_q4_1_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -527,7 +508,7 @@ kernel void kernel_mul_mv_q4_1_f32(
|
||||
|
||||
#define NB_Q8_0 8
|
||||
|
||||
kernel void kernel_mul_mv_q8_0_f32(
|
||||
kernel void kernel_mul_mat_q8_0_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -591,7 +572,7 @@ kernel void kernel_mul_mv_q8_0_f32(
|
||||
|
||||
#define N_F32_F32 4
|
||||
|
||||
kernel void kernel_mul_mv_f32_f32(
|
||||
kernel void kernel_mul_mat_f32_f32(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -662,7 +643,7 @@ kernel void kernel_mul_mv_f32_f32(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mv_f16_f32_1row(
|
||||
kernel void kernel_mul_mat_f16_f32_1row(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -681,7 +662,7 @@ kernel void kernel_mul_mv_f16_f32_1row(
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
@@ -716,7 +697,7 @@ kernel void kernel_mul_mv_f16_f32_1row(
|
||||
|
||||
#define N_F16_F32 4
|
||||
|
||||
kernel void kernel_mul_mv_f16_f32(
|
||||
kernel void kernel_mul_mat_f16_f32(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -788,7 +769,7 @@ kernel void kernel_mul_mv_f16_f32(
|
||||
}
|
||||
|
||||
// Assumes row size (ne00) is a multiple of 4
|
||||
kernel void kernel_mul_mv_f16_f32_l4(
|
||||
kernel void kernel_mul_mat_f16_f32_l4(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -849,9 +830,7 @@ kernel void kernel_alibi_f32(
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
constant float & m0,
|
||||
constant float & m1,
|
||||
constant int & n_heads_log2_floor,
|
||||
constant float & m0,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
@@ -867,12 +846,7 @@ kernel void kernel_alibi_f32(
|
||||
const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
|
||||
|
||||
device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
float m_k;
|
||||
if (i2 < n_heads_log2_floor) {
|
||||
m_k = pow(m0, i2 + 1);
|
||||
} else {
|
||||
m_k = pow(m1, 2 * (i2 - n_heads_log2_floor) + 1);
|
||||
}
|
||||
float m_k = pow(m0, i2 + 1);
|
||||
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
|
||||
device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1);
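The branch on n_heads_log2_floor above selects between two geometric slope schedules; below is a minimal CPU sketch of the same selection, with m0 and m1 derived the way ALiBi usually defines them (that derivation is an assumption, it is not shown in the diff).

// Per-head ALiBi slope: heads below the power-of-two floor use powers of m0,
// the remaining heads use odd powers of m1.
#include <math.h>
#include <stdio.h>

static float alibi_slope(int head, int n_heads_log2_floor, float m0, float m1) {
    if (head < n_heads_log2_floor) {
        return powf(m0, head + 1);
    }
    return powf(m1, 2 * (head - n_heads_log2_floor) + 1);
}

int main(void) {
    const int   n_heads_log2_floor = 8;   // assumed: head count rounded down to a power of two
    const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
    const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
    for (int h = 0; h < 12; h++) {
        printf("head %2d -> slope %g\n", h, alibi_slope(h, n_heads_log2_floor, m0, m1));
    }
    return 0;
}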
|
||||
@@ -1117,62 +1091,6 @@ kernel void kernel_cpy_f32_f32(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_concat(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int64_t i03 = tgpig.z;
|
||||
const int64_t i02 = tgpig.y;
|
||||
const int64_t i01 = tgpig.x;
|
||||
|
||||
const int64_t i13 = i03 % ne13;
|
||||
const int64_t i12 = i02 % ne12;
|
||||
const int64_t i11 = i01 % ne11;
|
||||
|
||||
device const char * src0_ptr = src0 + i03 * nb03 + i02 * nb02 + i01 * nb01 + tpitg.x*nb00;
|
||||
device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
|
||||
device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||
if (i02 < ne02) {
|
||||
((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
|
||||
src0_ptr += ntg.x*nb00;
|
||||
} else {
|
||||
((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
|
||||
src1_ptr += ntg.x*nb10;
|
||||
}
|
||||
dst_ptr += ntg.x*nb0;
|
||||
}
|
||||
}
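For intuition, a simplified CPU reference for the concat kernel above: along dim 2 the first ne02 planes come from src0 and the rest from src1. Broadcasting via the modulo indices and non-contiguous strides are ignored here; contiguous float data is assumed.

// Concatenate two contiguous f32 tensors along dimension 2.
#include <string.h>

static void concat_dim2_f32(const float * src0, const float * src1, float * dst,
                            int ne0, int ne1, int ne02, int ne12) {
    const size_t plane = (size_t) ne0 * ne1;                 // elements per dim-2 plane
    memcpy(dst,                src0, ne02 * plane * sizeof(float));
    memcpy(dst + ne02 * plane, src1, ne12 * plane * sizeof(float));
}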
|
||||
|
||||
//============================================ k-quants ======================================================
|
||||
|
||||
#ifndef QK_K
|
||||
@@ -1265,7 +1183,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
|
||||
|
||||
//====================================== dot products =========================
|
||||
|
||||
kernel void kernel_mul_mv_q2_K_f32(
|
||||
kernel void kernel_mul_mat_q2_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1409,7 +1327,7 @@ kernel void kernel_mul_mv_q2_K_f32(
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
kernel void kernel_mul_mv_q3_K_f32(
|
||||
kernel void kernel_mul_mat_q3_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1561,7 +1479,7 @@ kernel void kernel_mul_mv_q3_K_f32(
|
||||
}
|
||||
}
|
||||
#else
|
||||
kernel void kernel_mul_mv_q3_K_f32(
|
||||
kernel void kernel_mul_mat_q3_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1632,7 +1550,7 @@ kernel void kernel_mul_mv_q3_K_f32(
|
||||
#endif
|
||||
|
||||
#if QK_K == 256
|
||||
kernel void kernel_mul_mv_q4_K_f32(
|
||||
kernel void kernel_mul_mat_q4_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1738,7 +1656,7 @@ kernel void kernel_mul_mv_q4_K_f32(
|
||||
}
|
||||
}
|
||||
#else
|
||||
kernel void kernel_mul_mv_q4_K_f32(
|
||||
kernel void kernel_mul_mat_q4_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1827,7 +1745,7 @@ kernel void kernel_mul_mv_q4_K_f32(
|
||||
}
|
||||
#endif
|
||||
|
||||
kernel void kernel_mul_mv_q5_K_f32(
|
||||
kernel void kernel_mul_mat_q5_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -2000,7 +1918,7 @@ kernel void kernel_mul_mv_q5_K_f32(
|
||||
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mv_q6_K_f32(
|
||||
kernel void kernel_mul_mat_q6_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -2338,7 +2256,7 @@ kernel void kernel_get_rows(
|
||||
}
|
||||
|
||||
#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
|
||||
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
|
||||
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A
|
||||
#define BLOCK_SIZE_K 32
|
||||
#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
|
||||
#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
|
||||
@@ -2375,11 +2293,9 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
const uint r0 = tgpig.y;
|
||||
const uint r1 = tgpig.x;
|
||||
const uint im = tgpig.z;
|
||||
|
||||
// if this block is of 64x32 shape or smaller
|
||||
short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
|
||||
short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
|
||||
|
||||
// a thread shouldn't load data outside of the matrix
|
||||
short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
|
||||
short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
|
||||
@@ -2403,30 +2319,26 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
+ nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
|
||||
|
||||
for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
|
||||
// load data and store to threadgroup memory
|
||||
//load data and store to threadgroup memory
|
||||
half4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
#pragma unroll(16)
|
||||
for (int i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
|
||||
+ (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
|
||||
+ (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
|
||||
+ 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \
|
||||
+ (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
|
||||
}
|
||||
|
||||
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
|
||||
|
||||
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \
|
||||
= *((device float2x4 *)y);
|
||||
il = (il + 2 < nl) ? il + 2 : il % 2;
|
||||
x = (il < 2) ? x + (2+nl-1)/nl : x;
|
||||
y += BLOCK_SIZE_K;
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// load matrices from threadgroup memory and conduct outer products
|
||||
//load matrices from threadgroup memory and conduct outer products
|
||||
threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
|
||||
threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
|
||||
#pragma unroll(4)
|
||||
@@ -2441,7 +2353,6 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
|
||||
lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
|
||||
lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
|
||||
|
||||
#pragma unroll(8)
|
||||
for (int i = 0; i < 8; i++){
|
||||
simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
|
||||
@@ -2450,26 +2361,25 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
}
|
||||
|
||||
if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
|
||||
device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) \
|
||||
+ (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
|
||||
device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \
|
||||
+ (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
|
||||
}
|
||||
} else {
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
|
||||
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
|
||||
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
||||
if (sgitg == 0) {
|
||||
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
||||
if (sgitg==0) {
|
||||
for (int i = 0; i < n_rows; i++) {
|
||||
for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
|
||||
for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) {
|
||||
*(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
|
||||
}
|
||||
}
|
||||
|
||||
ggml-opencl.cpp (213 lines changed)
@@ -19,7 +19,7 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#define CL_DMMV_LOCAL_SIZE 32
|
||||
#define CL_DMMV_BLOCK_SIZE 32
|
||||
|
||||
#ifndef K_QUANTS_PER_ITERATION
|
||||
#define K_QUANTS_PER_ITERATION 1
|
||||
@@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8
|
||||
|
||||
__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
|
||||
{
|
||||
const int i = get_group_id(0) + get_global_offset(0);
|
||||
const int i = get_group_id(0);
|
||||
const int tid = get_local_id(0);
|
||||
const int n = tid / 32;
|
||||
const int l = tid - 32 * n;
|
||||
const int is = 8 * n + l / 16;
|
||||
|
||||
const uint8_t q = x[i].qs[32 * n + l];
|
||||
__global float *y = yy + get_group_id(0) * QK_K + 128 * n;
|
||||
__global float *y = yy + i * QK_K + 128 * n;
|
||||
|
||||
const float dall = vload_half(0, &x[i].d);
|
||||
const float dmin = vload_half(0, &x[i].dmin);
|
||||
@@ -223,7 +223,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
|
||||
__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
|
||||
{
|
||||
int r = get_local_id(0) / 4;
|
||||
int i = get_group_id(0) + get_global_offset(0);
|
||||
int i = get_group_id(0);
|
||||
int tid = r / 2;
|
||||
int is0 = r % 2;
|
||||
int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
|
||||
@@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
|
||||
float d_all = vload_half(0, &x[i].d);
|
||||
float dl = d_all * (us - 32);
|
||||
|
||||
__global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
|
||||
__global float *y = yy + i * QK_K + 128 * n + 32 * j;
|
||||
const __global uint8_t *q = x[i].qs + 32 * n;
|
||||
const __global uint8_t *hm = x[i].hmask;
|
||||
|
||||
@@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
|
||||
|
||||
__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
|
||||
{
|
||||
const int i = get_group_id(0) + get_global_offset(0);
|
||||
const int i = get_group_id(0);
|
||||
const int tid = get_local_id(0);
|
||||
const int il = tid / 8;
|
||||
const int ir = tid % 8;
|
||||
const int is = 2 * il;
|
||||
const int n = 4;
|
||||
|
||||
__global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;
|
||||
__global float *y = yy + i * QK_K + 64 * il + n * ir;
|
||||
|
||||
const float dall = vload_half(0, &x[i].d);
|
||||
const float dmin = vload_half(0, &x[i].dmin);
|
||||
@@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
|
||||
|
||||
__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
|
||||
{
|
||||
const int i = get_group_id(0) + get_global_offset(0);
|
||||
const int i = get_group_id(0);
|
||||
const int tid = get_local_id(0);
|
||||
const int il = tid / 16;
|
||||
const int ir = tid % 16;
|
||||
const int is = 2 * il;
|
||||
|
||||
__global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;
|
||||
__global float *y = yy + i * QK_K + 64 * il + 2 * ir;
|
||||
|
||||
const float dall = vload_half(0, &x[i].d);
|
||||
const float dmin = vload_half(0, &x[i].dmin);
|
||||
@@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
|
||||
|
||||
__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
|
||||
{
|
||||
const int i = get_group_id(0) + get_global_offset(0);
|
||||
const int i = get_group_id(0);
|
||||
const int tid = get_local_id(0);
|
||||
const int ip = tid / 32;
|
||||
const int il = tid - 32 * ip;
|
||||
const int is = 8 * ip + il / 16;
|
||||
|
||||
__global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;
|
||||
__global float *y = yy + i * QK_K + 128 * ip + il;
|
||||
|
||||
const float d = vload_half(0, &x[i].d);
|
||||
|
||||
@@ -338,7 +338,7 @@ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx,
|
||||
const int row = get_group_id(0);
|
||||
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
|
||||
__global const struct block_q2_K * x = xx + ib0;
|
||||
|
||||
@@ -413,7 +413,7 @@ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx,
|
||||
const int row = get_group_id(0);
|
||||
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
|
||||
__global const struct block_q3_K * x = xx + ib0;
|
||||
|
||||
@@ -489,7 +489,7 @@ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx,
|
||||
|
||||
const int row = get_group_id(0);
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
|
||||
const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15
|
||||
const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
|
||||
@@ -562,7 +562,7 @@ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx,
|
||||
|
||||
const int row = get_group_id(0);
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
|
||||
const int tid = get_local_id(0)/2; // 0...15
|
||||
const int ix = get_local_id(0)%2;
|
||||
@@ -641,7 +641,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
||||
const int row = get_group_id(0);
|
||||
|
||||
const int num_blocks_per_row = ncols / QK_K;
|
||||
const int ib0 = row*num_blocks_per_row + get_global_offset(0);
|
||||
const int ib0 = row*num_blocks_per_row;
|
||||
|
||||
__global const struct block_q6_K * x = xx + ib0;
|
||||
|
||||
@@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
|
||||
const uint qk = QUANT_K;
|
||||
const uint qr = QUANT_R;
|
||||
|
||||
const int ib = i/qk + get_global_offset(0); // block index
|
||||
const int ib = i/qk; // block index
|
||||
const int iqs = (i%qk)/qr; // quant index
|
||||
const int iybs = i - i%qk; // y block start index
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
@@ -745,21 +745,19 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
|
||||
|
||||
std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
|
||||
__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
|
||||
const int local_size = get_local_size(0);
|
||||
const int block_size = get_local_size(0);
|
||||
const int row = get_group_id(0);
|
||||
const int tid = get_local_id(0);
|
||||
|
||||
const uint qk = QUANT_K;
|
||||
const uint qr = QUANT_R;
|
||||
|
||||
const int col_step = local_size * 2;
|
||||
const int y_offset = qr == 1 ? 1 : qk/2;
|
||||
|
||||
x += get_global_offset(0);
|
||||
|
||||
tmp[tid] = 0;
|
||||
|
||||
for (int col = tid*2; col < ncols; col += col_step) {
|
||||
for (int i = 0; i < ncols/block_size; i += 2) {
|
||||
const int col = i*block_size + 2*tid;
|
||||
const int ib = (row*ncols + col)/qk; // block index
|
||||
const int iqs = (col%qk)/qr; // quant index
|
||||
const int iybs = col - col%qk; // y block start index
|
||||
@@ -775,7 +773,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
|
||||
|
||||
// sum up partial sums and write back result
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
for (int s=local_size/2; s>0; s>>=1) {
|
||||
for (int s=block_size/2; s>0; s>>=1) {
|
||||
if (tid < s) {
|
||||
tmp[tid] += tmp[tid + s];
|
||||
}
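The loop above is a standard in-place tree reduction over the per-work-item partial sums; a CPU illustration of the same pattern follows (on the device each tid is a separate work-item and a local barrier separates the steps).

// Halve the active range each step, folding the upper half into the lower half.
static float reduce_partial_sums(float * tmp, int block_size) {   // block_size assumed power of two
    for (int s = block_size / 2; s > 0; s >>= 1) {
        for (int tid = 0; tid < s; tid++) {
            tmp[tid] += tmp[tid + s];
        }
        // on the GPU: barrier(CLK_LOCAL_MEM_FENCE) here
    }
    return tmp[0];
}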
|
||||
@@ -1351,42 +1349,30 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
|
||||
const enum ggml_type type = src->type;
|
||||
const size_t ts = ggml_type_size(type);
|
||||
const size_t bs = ggml_blck_size(type);
|
||||
const uint64_t row_size = ts*ne0/bs;
|
||||
|
||||
const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
|
||||
if (nb0 == ts && nb1 == row_size) {
|
||||
return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
|
||||
const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
|
||||
if (nb0 == ts && nb1 == ts*ne0/bs) {
|
||||
err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
|
||||
return err;
|
||||
}
|
||||
if (nb0 == ts) {
|
||||
const size_t buffer_origin[3] = { offset, 0, 0 };
|
||||
const size_t host_origin[3] = { 0, 0, 0 };
|
||||
const size_t region[3] = { row_size, ne1, 1 };
|
||||
return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
|
||||
const size_t region[3] = { ts*ne0/bs, ne1, 1 };
|
||||
err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
|
||||
return err;
|
||||
}
|
||||
std::vector<cl_event> events;
|
||||
if (ev && ne1>1) events.reserve(ne1-1);
|
||||
for (uint64_t i1 = 0; i1 < ne1; i1++) {
|
||||
// pretend the row is a matrix with cols=1
|
||||
const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
|
||||
const size_t buffer_origin[3] = { offset, i1, 0 };
|
||||
const size_t host_origin[3] = { 0, 0, 0 };
|
||||
const size_t region[3] = { ts, ne0/bs, 1 };
|
||||
// if an event is requested, make the last write wait for all previous writes to complete
|
||||
if (ev && i1) {
|
||||
events.push_back(*ev);
|
||||
}
|
||||
cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
|
||||
err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
|
||||
const size_t region[3] = { ts/bs, ne0, 1 };
|
||||
err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
|
||||
if (err != CL_SUCCESS) {
|
||||
for (auto event : events) {
|
||||
clReleaseEvent(event);
|
||||
}
|
||||
return err;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (auto event : events) {
|
||||
CL_CHECK(clReleaseEvent(event));
|
||||
}
|
||||
return CL_SUCCESS;
|
||||
return err;
|
||||
}
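The fallback path above copies one row per iteration when neither the fully contiguous nor the rect-copy case applies; conceptually it is the strided row copy sketched below (hypothetical helper, a host-to-host memcpy standing in for the OpenCL write).

#include <stdint.h>
#include <string.h>

// Copy ne1 rows of row_size bytes from a source with row stride nb1 into a packed destination.
static void copy_rows_strided(char * dst, const char * src,
                              size_t row_size, size_t nb1, int64_t ne1) {
    for (int64_t i1 = 0; i1 < ne1; i1++) {
        memcpy(dst + i1 * row_size, src + i1 * nb1, row_size);
    }
}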
|
||||
|
||||
static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
@@ -1490,15 +1476,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
const int64_t ne13 = src1->ne[3];
|
||||
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
|
||||
const int64_t r2 = ne12 / ne02;
|
||||
const int64_t r3 = ne13 / ne03;
|
||||
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
const int x_ne = ne01 * ne00;
|
||||
@@ -1517,25 +1498,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||
|
||||
size_t x_offset = 0;
|
||||
int64_t pi02 = -1;
|
||||
int64_t pi03 = -1;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
int64_t i03 = i13 / r3;
|
||||
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
int64_t i02 = i12 / r2;
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
// copy data to device
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else if (i02 != pi02 || i03 != pi03) {
|
||||
if (src0->backend != GGML_BACKEND_GPU) {
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
||||
pi02 = i02;
|
||||
pi03 = i03;
|
||||
}
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
|
||||
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
@@ -1545,7 +1514,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, x_offset, ne00,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
@@ -1556,7 +1525,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
||||
}
|
||||
}
|
||||
@@ -1568,7 +1537,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
ggml_cl_pool_free(d_D, d_size);
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
|
||||
static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
|
||||
GGML_ASSERT(fp16_support);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
@@ -1578,8 +1547,6 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
const int64_t ne13 = src1->ne[3];
|
||||
|
||||
const int nb10 = src1->nb[0];
|
||||
const int nb11 = src1->nb[1];
|
||||
@@ -1589,19 +1556,12 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
|
||||
const int64_t r2 = ne12 / ne02;
|
||||
const int64_t r3 = ne13 / ne03;
|
||||
|
||||
const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
|
||||
const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
|
||||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = ne11 * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
|
||||
GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne);
|
||||
GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne);
|
||||
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata;
|
||||
|
||||
size_t x_size;
|
||||
size_t y_size;
|
||||
size_t d_size;
|
||||
@@ -1617,43 +1577,32 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
bool src1_cont_rows = nb10 == sizeof(float);
|
||||
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
|
||||
|
||||
size_t x_offset = 0;
|
||||
int64_t pi02 = -1;
|
||||
int64_t pi03 = -1;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
int64_t i03 = i13 / r3;
|
||||
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
int64_t i02 = i12 / r2;
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
// copy src0 to device
|
||||
if (src0->backend == GGML_BACKEND_GPU) {
|
||||
x_offset = (i03 * ne02 + i02) * x_ne;
|
||||
} else if (i02 != pi02 || i03 != pi03) {
|
||||
if (src0->backend != GGML_BACKEND_GPU) {
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
||||
pi02 = i02;
|
||||
pi03 = i03;
|
||||
}
|
||||
|
||||
// convert src1 to fp16
|
||||
// TODO: use multiple threads
|
||||
char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
|
||||
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
|
||||
char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
|
||||
if (src1_cont_rows) {
|
||||
if (src1_cont_cols) {
|
||||
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
|
||||
}
|
||||
else {
|
||||
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
||||
ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
|
||||
for (int64_t i01 = 0; i01 < ne11; i01++) {
|
||||
ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int64_t i11 = 0; i11 < ne11; i11++) {
|
||||
for (int64_t i10 = 0; i10 < ne10; i10++) {
|
||||
for (int64_t i01 = 0; i01 < ne11; i01++) {
|
||||
for (int64_t i00 = 0; i00 < ne10; i00++) {
|
||||
// very slow due to no inlining
|
||||
tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
|
||||
tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1669,7 +1618,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, x_offset, ne00,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
@@ -1682,7 +1631,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||
// copy dst to host, then convert to float
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
||||
}
|
||||
@@ -1703,24 +1652,18 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
const int64_t ne13 = src1->ne[3];
|
||||
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
const ggml_type type = src0->type;
|
||||
const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0;
|
||||
|
||||
const int64_t r2 = ne12 / ne02;
|
||||
const int64_t r3 = ne13 / ne03;
|
||||
const bool mul_mat_vec = ne11 == 1;
|
||||
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = ne11 * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
|
||||
const size_t q_sz = ggml_type_size(type) * x_bps;
|
||||
const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
|
||||
|
||||
size_t x_size;
|
||||
size_t y_size;
|
||||
@@ -1742,28 +1685,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
GGML_ASSERT(to_fp32_cl != nullptr);
|
||||
|
||||
const size_t global_denom = ggml_cl_global_denom(type);
|
||||
const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type);
|
||||
const size_t local = ggml_cl_local_size(type);
|
||||
|
||||
size_t ev_idx = 0;
|
||||
std::vector<cl_event> events;
|
||||
|
||||
int64_t pi02 = -1;
|
||||
int64_t pi03 = -1;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
int64_t i03 = i13 / r3;
|
||||
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
int64_t i02 = i12 / r2;
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
// copy src0 to device if necessary
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
if (i02 != pi02 || i03 != pi03) {
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
||||
pi02 = i02;
|
||||
pi03 = i03;
|
||||
}
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
|
||||
} else if (src0->backend == GGML_BACKEND_GPU) {
|
||||
d_Q = (cl_mem) src0->extra;
|
||||
} else {
|
||||
@@ -1772,11 +1704,11 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
||||
// copy src1 to device
|
||||
events.emplace_back();
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
|
||||
|
||||
// compute
|
||||
const size_t global = ne01 * local;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
|
||||
const size_t local = CL_DMMV_BLOCK_SIZE;
|
||||
const cl_int ncols = ne00;
|
||||
events.emplace_back();
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
|
||||
@@ -1784,17 +1716,16 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
|
||||
CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, NULL, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
|
||||
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
|
||||
// convert src0 to fp32 on device
|
||||
const size_t global = x_ne / global_denom;
|
||||
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
|
||||
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
|
||||
|
||||
// copy src1 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
|
||||
|
||||
events.emplace_back();
|
||||
|
||||
@@ -1818,7 +1749,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
|
||||
for (auto *event : events) {
|
||||
clReleaseEvent(event);
|
||||
@@ -1900,8 +1831,8 @@ void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
|
||||
}
|
||||
|
||||
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
|
||||
return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]);
|
||||
if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
|
||||
return ggml_nelements(src1) * sizeof(ggml_fp16_t);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -1913,19 +1844,17 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
|
||||
const int64_t ne3 = tensor->ne[3];
|
||||
|
||||
const ggml_type type = tensor->type;
|
||||
const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
|
||||
const size_t q_sz = s_sz * (size_t) (ne2 * ne3);
|
||||
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
|
||||
|
||||
size_t q_size;
|
||||
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
|
||||
|
||||
tensor->data = data;
|
||||
// copy tensor to device
|
||||
size_t offset = 0;
|
||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
|
||||
offset += s_sz;
|
||||
int i = i3*ne2 + i2;
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
ggml.h (127 lines changed)
@@ -214,8 +214,8 @@
|
||||
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
||||
|
||||
#define GGML_MAX_DIMS 4
|
||||
#define GGML_MAX_NODES 16384
|
||||
#define GGML_MAX_PARAMS 1024
|
||||
#define GGML_MAX_NODES 4096
|
||||
#define GGML_MAX_PARAMS 256
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#define GGML_MAX_SRC 6
|
||||
#define GGML_MAX_NAME 64
|
||||
@@ -248,14 +248,6 @@
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#ifndef NDEBUG
|
||||
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
|
||||
#elif defined(__GNUC__)
|
||||
#define GGML_UNREACHABLE() __builtin_unreachable()
|
||||
#else
|
||||
#define GGML_UNREACHABLE() ((void) 0)
|
||||
#endif
|
||||
|
||||
// used to copy the number of elements and stride in bytes of tensors into local variables.
|
||||
// main purpose is to reduce code duplication and improve readability.
|
||||
//
|
||||
@@ -326,7 +318,7 @@ extern "C" {
|
||||
GGML_TYPE_COUNT,
|
||||
};
|
||||
|
||||
enum ggml_backend_type {
|
||||
enum ggml_backend {
|
||||
GGML_BACKEND_CPU = 0,
|
||||
GGML_BACKEND_GPU = 10,
|
||||
GGML_BACKEND_GPU_SPLIT = 20,
|
||||
@@ -401,14 +393,10 @@ extern "C" {
|
||||
GGML_OP_CLAMP,
|
||||
GGML_OP_CONV_1D,
|
||||
GGML_OP_CONV_2D,
|
||||
GGML_OP_CONV_TRANSPOSE_1D,
|
||||
GGML_OP_CONV_TRANSPOSE_2D,
|
||||
GGML_OP_POOL_1D,
|
||||
GGML_OP_POOL_2D,
|
||||
|
||||
GGML_OP_CONV_1D_STAGE_0, // internal
|
||||
GGML_OP_CONV_1D_STAGE_1, // internal
|
||||
|
||||
GGML_OP_UPSCALE, // nearest interpolate
|
||||
|
||||
GGML_OP_FLASH_ATTN,
|
||||
@@ -479,16 +467,14 @@ extern "C" {
|
||||
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
enum ggml_backend_type backend;
|
||||
|
||||
struct ggml_backend_buffer * buffer;
|
||||
enum ggml_type type;
|
||||
enum ggml_backend backend;
|
||||
|
||||
int n_dims;
|
||||
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
||||
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
||||
// nb[0] = ggml_type_size(type)
|
||||
// nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
|
||||
// nb[0] = sizeof(type)
|
||||
// nb[1] = nb[0] * ne[0] + padding
|
||||
// nb[i] = nb[i-1] * ne[i-1]
|
||||
|
||||
// compute data
|
||||
@@ -516,7 +502,7 @@ extern "C" {
|
||||
|
||||
void * extra; // extra things e.g. for ggml-cuda.cu
|
||||
|
||||
char padding[12];
|
||||
char padding[4];
|
||||
};
|
||||
|
||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||
@@ -540,15 +526,7 @@ extern "C" {
|
||||
// next prime after GGML_MAX_NODES
|
||||
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
|
||||
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
|
||||
// #define GGML_GRAPH_HASHTABLE_SIZE 8273
|
||||
// #define GGML_GRAPH_HASHTABLE_SIZE 16411
|
||||
#define GGML_GRAPH_HASHTABLE_SIZE 32771
|
||||
|
||||
enum ggml_cgraph_eval_order {
|
||||
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
||||
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
||||
GGML_CGRAPH_EVAL_ORDER_COUNT
|
||||
};
|
||||
#define GGML_GRAPH_HASHTABLE_SIZE 8273
|
||||
|
||||
// computation graph
|
||||
struct ggml_cgraph {
|
||||
@@ -561,8 +539,6 @@ extern "C" {
|
||||
|
||||
void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
||||
|
||||
enum ggml_cgraph_eval_order order;
|
||||
|
||||
// performance
|
||||
int perf_runs;
|
||||
int64_t perf_cycles;
|
||||
@@ -704,30 +680,18 @@ extern "C" {
|
||||
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
||||
|
||||
// Context tensor enumeration and lookup
|
||||
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
|
||||
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||
|
||||
// Converts a flat index into coordinates
|
||||
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
||||
|
||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||
|
||||
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||
|
||||
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
||||
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
||||
|
||||
@@ -761,12 +725,6 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_add_cast(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
enum ggml_type type);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_add1(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -876,7 +834,6 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// sums repetitions in a into shape of b
|
||||
GGML_API struct ggml_tensor * ggml_repeat_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -1363,7 +1320,7 @@ extern "C" {
|
||||
|
||||
// alibi position embedding
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_alibi(
|
||||
struct ggml_tensor * ggml_alibi(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
@@ -1372,7 +1329,7 @@ extern "C" {
|
||||
|
||||
// clamp
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_clamp(
|
||||
struct ggml_tensor * ggml_clamp(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
float min,
|
||||
@@ -1395,14 +1352,6 @@ extern "C" {
|
||||
int s,
|
||||
int d);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int p0,
|
||||
int d0);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -1740,16 +1689,6 @@ extern "C" {
|
||||
// dump the graph into a file using the dot format
|
||||
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
|
||||
|
||||
// build gradient checkpointing backward graph gb for gf using provided checkpoints
|
||||
// gb_tmp will contain original backward graph with rewritten backward process nodes,
|
||||
// but without the second forward pass nodes.
|
||||
GGML_API void ggml_build_backward_gradient_checkpointing(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_cgraph * gb,
|
||||
struct ggml_cgraph * gb_tmp,
|
||||
struct ggml_tensor * * checkpoints,
|
||||
int n_checkpoints);
|
||||
//
|
||||
// optimization
|
||||
//
|
||||
@@ -1776,7 +1715,6 @@ extern "C" {
|
||||
GGML_OPT_NO_CONTEXT,
|
||||
GGML_OPT_INVALID_WOLFE,
|
||||
GGML_OPT_FAIL,
|
||||
GGML_OPT_CANCEL,
|
||||
|
||||
GGML_LINESEARCH_FAIL = -128,
|
||||
GGML_LINESEARCH_MINIMUM_STEP,
|
||||
@@ -1785,7 +1723,7 @@ extern "C" {
|
||||
GGML_LINESEARCH_INVALID_PARAMETERS,
|
||||
};
|
||||
|
||||
typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
|
||||
typedef void (*ggml_opt_callback)(void * data, float * sched);
|
||||
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
||||
|
||||
// optimization parameters
|
||||
@@ -1817,8 +1755,6 @@ extern "C" {
|
||||
bool print_forward_graph;
|
||||
bool print_backward_graph;
|
||||
|
||||
int n_gradient_accumulation;
|
||||
|
||||
// ADAM parameters
|
||||
struct {
|
||||
int n_iter;
|
||||
@@ -1864,7 +1800,6 @@ extern "C" {
|
||||
float loss_after;
|
||||
|
||||
struct {
|
||||
struct ggml_tensor * g; // current gradient
|
||||
struct ggml_tensor * m; // first moment
|
||||
struct ggml_tensor * v; // second moment
|
||||
struct ggml_tensor * pf; // past function values
|
||||
@@ -1981,26 +1916,26 @@ extern "C" {
|
||||
|
||||
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
||||
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
|
||||
|
||||
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
|
||||
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
|
||||
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
|
||||
|
||||
// will abort if the wrong type is used for the key
|
||||
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
|
||||
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
||||
// results are undefined if the wrong type is used for the key
|
||||
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
|
||||
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
|
||||
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
|
||||
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
|
||||
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
|
||||
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
|
||||
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
|
||||
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
||||
|
||||
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
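Since the typed getters above abort when called with the wrong type for a key, a lookup normally checks gguf_get_kv_type first; a minimal sketch follows (the key name is an assumption for illustration).

#include <stdio.h>
#include "ggml.h"   // gguf_* declarations shown above

static void print_context_length(const struct gguf_context * ctx) {
    const int key_id = gguf_find_key(ctx, "llama.context_length");
    if (key_id < 0) {
        printf("key not found\n");
        return;
    }
    if (gguf_get_kv_type(ctx, key_id) == GGUF_TYPE_UINT32) {
        printf("context length: %u\n", gguf_get_val_u32(ctx, key_id));
    }
}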
|
||||
@@ -2107,7 +2042,7 @@ extern "C" {
|
||||
enum ggml_type vec_dot_type;
|
||||
} ggml_type_traits_t;
|
||||
|
||||
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
||||
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@@ -69,3 +69,4 @@ python -m twine upload dist/*
|
||||
## TODO
|
||||
- [ ] Add tests
|
||||
- [ ] Include conversion scripts as command line entry points in this package.
|
||||
- [ ] Add CI workflow for releasing the package.
|
||||
|
||||
@@ -85,34 +85,26 @@ class MODEL_ARCH(IntEnum):
|
||||
GPTNEOX : int = auto()
|
||||
MPT : int = auto()
|
||||
STARCODER : int = auto()
|
||||
PERSIMMON : int = auto()
|
||||
REFACT : int = auto()
|
||||
BERT : int = auto()
|
||||
BLOOM : int = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
TOKEN_EMBD : int = auto()
|
||||
TOKEN_EMBD_NORM : int = auto()
|
||||
TOKEN_TYPES : int = auto()
|
||||
POS_EMBD : int = auto()
|
||||
OUTPUT : int = auto()
|
||||
OUTPUT_NORM : int = auto()
|
||||
ROPE_FREQS : int = auto()
|
||||
ATTN_Q : int = auto()
|
||||
ATTN_K : int = auto()
|
||||
ATTN_V : int = auto()
|
||||
ATTN_QKV : int = auto()
|
||||
ATTN_OUT : int = auto()
|
||||
ATTN_NORM : int = auto()
|
||||
ATTN_NORM_2 : int = auto()
|
||||
ATTN_ROT_EMBD : int = auto()
|
||||
FFN_GATE : int = auto()
|
||||
FFN_DOWN : int = auto()
|
||||
FFN_UP : int = auto()
|
||||
FFN_NORM : int = auto()
|
||||
ATTN_Q_NORM : int = auto()
|
||||
ATTN_K_NORM : int = auto()
|
||||
TOKEN_EMBD : int = auto()
|
||||
POS_EMBD : int = auto()
|
||||
OUTPUT : int = auto()
|
||||
OUTPUT_NORM : int = auto()
|
||||
ROPE_FREQS : int = auto()
|
||||
ATTN_Q : int = auto()
|
||||
ATTN_K : int = auto()
|
||||
ATTN_V : int = auto()
|
||||
ATTN_QKV : int = auto()
|
||||
ATTN_OUT : int = auto()
|
||||
ATTN_NORM : int = auto()
|
||||
ATTN_NORM_2 : int = auto()
|
||||
ATTN_ROT_EMBD: int = auto()
|
||||
FFN_GATE : int = auto()
|
||||
FFN_DOWN : int = auto()
|
||||
FFN_UP : int = auto()
|
||||
FFN_NORM : int = auto()
|
||||
|
||||
|
||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
@@ -124,183 +116,78 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.GPTNEOX: "gptneox",
|
||||
MODEL_ARCH.MPT: "mpt",
|
||||
MODEL_ARCH.STARCODER: "starcoder",
|
||||
MODEL_ARCH.PERSIMMON: "persimmon",
|
||||
MODEL_ARCH.REFACT: "refact",
|
||||
MODEL_ARCH.BERT: "bert",
|
||||
MODEL_ARCH.BLOOM: "bloom",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
|
||||
MODEL_TENSOR.TOKEN_TYPES: "token_types",
|
||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
||||
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
}
|
||||
|
||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_ARCH.LLAMA: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.GPTNEOX: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.FALCON: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_NORM_2,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.BAICHUAN: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_ROT_EMBD,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.STARCODER: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.POS_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.BERT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_TYPES,
MODEL_TENSOR.POS_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.MPT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.GPTJ: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.PERSIMMON: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
MODEL_ARCH.REFACT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.BLOOM: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.GPT2: [
MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
MODEL_ARCH.LLAMA: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.GPTNEOX: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.FALCON: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.BAICHUAN: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.STARCODER: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.POS_EMBD: "position_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.GPT2: {
# TODO
],
},
# TODO
}

@@ -314,9 +201,6 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
MODEL_ARCH.PERSIMMON: [
MODEL_TENSOR.ROPE_FREQS,
]
}


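The hunks above split what used to be a single per-architecture name dict into two tables: TENSOR_NAMES holds the GGUF name pattern for each tensor type (with a {bid} placeholder for per-block tensors), and MODEL_TENSORS lists which tensor types each architecture uses. A minimal sketch of how the two combine, assuming the gguf package exports the tables under the names shown (the import path may differ between gguf-py versions):

# Illustrative sketch, not part of this diff: enumerate the GGUF tensor names
# expected for one architecture from the TENSOR_NAMES / MODEL_TENSORS tables.
from gguf import MODEL_ARCH, MODEL_TENSORS, TENSOR_NAMES  # assumed export names

def expected_tensor_names(arch: MODEL_ARCH, n_blocks: int) -> list[str]:
    names: list[str] = []
    for tensor in MODEL_TENSORS[arch]:
        fmt = TENSOR_NAMES[tensor]
        if "{bid}" in fmt:
            # per-block tensors repeat once per transformer block
            names.extend(fmt.format(bid=bid) for bid in range(n_blocks))
        else:
            names.append(fmt)
    return names

# expected_tensor_names(MODEL_ARCH.LLAMA, 2) -> ["token_embd", "output_norm",
# "output", "rope_freqs", "blk.0.attn_norm", "blk.1.attn_norm", ...]
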
@@ -324,50 +208,31 @@ class TensorNameMap:
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Token embeddings
MODEL_TENSOR.TOKEN_EMBD: (
"gpt_neox.embed_in", # gptneox
"transformer.wte", # gpt2 gpt-j mpt refact
"transformer.word_embeddings", # falcon
"word_embeddings", # bloom
"model.embed_tokens", # llama-hf
"tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert
"language_model.embedding.word_embeddings", # persimmon
),

# Token type embeddings
MODEL_TENSOR.TOKEN_TYPES: (
"embeddings.token_type_embeddings", # bert
),

# Normalization of token embeddings
MODEL_TENSOR.TOKEN_EMBD_NORM: (
"word_embeddings_layernorm", # bloom
"gpt_neox.embed_in", # gptneox
"transformer.wte", # gpt2 mpt
"transformer.word_embeddings", # falcon
"model.embed_tokens", # llama-hf
"tok_embeddings", # llama-pth
),

# Position embeddings
MODEL_TENSOR.POS_EMBD: (
"transformer.wpe", # gpt2
"embeddings.position_embeddings", # bert
"transformer.wpe", # gpt2
),

# Output
MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan
"output", # llama-pth bloom
"word_embeddings_for_head", # persimmon
"embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan
"output", # llama-pth
),

# Output norm
MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 gpt-j falcon
"model.norm", # llama-hf baichuan
"norm", # llama-pth
"embeddings.LayerNorm", # bert
"transformer.norm_f", # mpt
"ln_f", # refact bloom
"language_model.encoder.final_layernorm", # persimmon
"gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 falcon
"model.norm", # llama-hf baichuan
"norm", # llama-pth
),

# Rope frequencies
@@ -379,16 +244,13 @@ class TensorNameMap:
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Attention norm
MODEL_TENSOR.ATTN_NORM: (
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
"transformer.blocks.{bid}.norm_1", # mpt
"transformer.h.{bid}.input_layernorm", # falcon7b
"h.{bid}.input_layernorm", # bloom
"transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", # llama-hf
"layers.{bid}.attention_norm", # llama-pth
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
"transformer.h.{bid}.ln_1", # gpt2
"transformer.blocks.{bid}.norm_1", # mpt
"transformer.h.{bid}.input_layernorm", # falcon7b
"transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", # llama-hf
"layers.{bid}.attention_norm", # llama-pth
),

# Attention norm 2
@@ -398,50 +260,38 @@ class TensorNameMap:

# Attention query-key-value
MODEL_TENSOR.ATTN_QKV: (
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
"transformer.h.{bid}.attn.c_attn", # gpt2
"transformer.blocks.{bid}.attn.Wqkv", # mpt
"transformer.h.{bid}.self_attention.query_key_value", # falcon
"h.{bid}.self_attention.query_key_value", # bloom
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
"transformer.h.{bid}.attn.c_attn", # gpt2
"transformer.blocks.{bid}.attn.Wqkv", # mpt
"transformer.h.{bid}.self_attention.query_key_value", # falcon
),

# Attention query
MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf
"layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j
"model.layers.{bid}.self_attn.q_proj", # llama-hf
"layers.{bid}.attention.wq", # llama-pth
),

# Attention key
MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf
"layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j
"model.layers.{bid}.self_attn.k_proj", # llama-hf
"layers.{bid}.attention.wk", # llama-pth
),

# Attention value
MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf
"layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j
"model.layers.{bid}.self_attn.v_proj", # llama-hf
"layers.{bid}.attention.wv", # llama-pth
),

# Attention output
MODEL_TENSOR.ATTN_OUT: (
"gpt_neox.layers.{bid}.attention.dense", # gptneox
"transformer.h.{bid}.attn.c_proj", # gpt2 refact
"transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom
"model.layers.{bid}.self_attn.o_proj", # llama-hf
"layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert
"transformer.h.{bid}.attn.out_proj", # gpt-j
"language_model.encoder.layers.{bid}.self_attention.dense" # persimmon
"gpt_neox.layers.{bid}.attention.dense", # gptneox
"transformer.h.{bid}.attn.c_proj", # gpt2
"transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon
"model.layers.{bid}.self_attn.o_proj", # llama-hf
"layers.{bid}.attention.wo", # llama-pth
),

# Rotary embeddings
@@ -452,83 +302,64 @@ class TensorNameMap:

# Feed-forward norm
MODEL_TENSOR.FFN_NORM: (
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
"transformer.h.{bid}.ln_2", # gpt2 refact
"h.{bid}.post_attention_layernorm", # bloom
"transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf
"layers.{bid}.ffn_norm", # llama-pth
"encoder.layer.{bid}.output.LayerNorm", # bert
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
"transformer.h.{bid}.ln_2", # gpt2
"transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf
"layers.{bid}.ffn_norm", # llama-pth
),

# Feed-forward up
MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
"transformer.h.{bid}.mlp.c_fc", # gpt2
"transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"h.{bid}.mlp.dense_h_to_4h", # bloom
"model.layers.{bid}.mlp.up_proj", # llama-hf refact
"layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", # bert
"transformer.h.{bid}.mlp.fc_in", # gpt-j
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
"transformer.h.{bid}.mlp.c_fc", # gpt2
"transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"model.layers.{bid}.mlp.up_proj", # llama-hf
"layers.{bid}.feed_forward.w3", # llama-pth
),

# Feed-forward gate
MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
"model.layers.{bid}.mlp.gate_proj", # llama-hf
"layers.{bid}.feed_forward.w1", # llama-pth
),

# Feed-forward down
MODEL_TENSOR.FFN_DOWN: (
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact
"transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"h.{bid}.mlp.dense_4h_to_h", # bloom
"model.layers.{bid}.mlp.down_proj", # llama-hf
"layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", # bert
"transformer.h.{bid}.mlp.fc_out", # gpt-j
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
"transformer.h.{bid}.mlp.c_proj", # gpt2
"transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"model.layers.{bid}.mlp.down_proj", # llama-hf
"layers.{bid}.feed_forward.w2", # llama-pth
),

MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
),

MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
),

MODEL_TENSOR.ROPE_FREQS: (
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
)
}

mapping: dict[str, tuple[MODEL_TENSOR, str]]

tensor_names: dict[MODEL_TENSOR, str]

def __init__(self, arch: MODEL_ARCH, n_blocks: int):
self.mapping = {}
mapping = self.mapping = {}
tensor_names = self.tensor_names = MODEL_TENSOR_NAMES[arch]
for tensor, keys in self.mappings_cfg.items():
if tensor not in MODEL_TENSORS[arch]:
tensor_name = tensor_names.get(tensor)
if tensor_name is None:
continue
tensor_name = TENSOR_NAMES[tensor]
self.mapping[tensor_name] = (tensor, tensor_name)
mapping[tensor_name] = (tensor, tensor_name)
for key in keys:
self.mapping[key] = (tensor, tensor_name)
mapping[key] = (tensor, tensor_name)
for bid in range(n_blocks):
for tensor, keys in self.block_mappings_cfg.items():
if tensor not in MODEL_TENSORS[arch]:
tensor_name = tensor_names.get(tensor)
if tensor_name is None:
continue
tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
self.mapping[tensor_name] = (tensor, tensor_name)
tensor_name = tensor_name.format(bid = bid)
mapping[tensor_name] = (tensor, tensor_name)
for key in keys:
key = key.format(bid = bid)
self.mapping[key] = (tensor, tensor_name)
mapping[key] = (tensor, tensor_name)

def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
result = self.mapping.get(key)
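The reworked TensorNameMap above builds one flat lookup from every known checkpoint tensor name (HF and original-pth variants) to the corresponding GGUF tensor type and name. A small usage sketch, assuming the class and architecture enum are importable from the gguf package as shown; the tensor names below are example inputs only:

# Illustrative sketch, not part of this diff: map Hugging Face tensor names
# to GGUF names via TensorNameMap.
import gguf

tmap = gguf.TensorNameMap(gguf.MODEL_ARCH.LLAMA, 32)

for hf_name in ("model.embed_tokens.weight", "model.layers.3.self_attn.q_proj.weight"):
    result = tmap.get_type_and_name(hf_name, try_suffixes=(".weight", ".bias"))
    if result is None:
        # unmapped tensors are typically skipped by the conversion scripts
        print(f"{hf_name}: no mapping")
        continue
    tensor_type, gguf_name = result
    # get_type_and_name strips the matched suffix; callers append it again
    print(f"{hf_name} -> {gguf_name}.weight ({tensor_type.name})")
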
@@ -969,25 +800,22 @@ class SpecialVocab:
special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
special_token_ids: dict[str, int] = {}

def __init__(
self, path: str | os.PathLike[str], load_merges: bool = False,
special_token_types: tuple[str, ...] | None = None,
):
def __init__(self, path: Path, load_merges: bool = False, special_token_types: tuple[str, ...] | None = None):
self.special_token_ids = {}
self.load_merges = load_merges
if special_token_types is not None:
self.special_token_types = special_token_types
self._load(Path(path))
self.load(path)

def _load(self, path: Path) -> None:
if not self._try_load_from_tokenizer_json(path):
self._try_load_from_config_json(path)
def load(self, path: Path):
if not self.try_load_from_tokenizer_json(path):
self.try_load_from_config_json(path)

def _try_load_from_tokenizer_json(self, path: Path) -> bool:
def try_load_from_tokenizer_json(self, path: Path) -> bool:
tokenizer_file = path / 'tokenizer.json'
if not tokenizer_file.is_file():
return False
with open(tokenizer_file, encoding = 'utf-8') as f:
with open(tokenizer_file, 'r', encoding = 'utf-8') as f:
tokenizer = json.load(f)
if self.load_merges:
merges = tokenizer.get('model', {}).get('merges')
@@ -997,7 +825,7 @@ class SpecialVocab:
added_tokens = tokenizer.get('added_tokens')
if added_tokens is None or not tokenizer_config_file.is_file():
return True
with open(tokenizer_config_file, encoding = 'utf-8') as f:
with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f:
tokenizer_config = json.load(f)
for typ in self.special_token_types:
entry = tokenizer_config.get(f'{typ}_token')
@@ -1016,11 +844,11 @@ class SpecialVocab:
break
return True

def _try_load_from_config_json(self, path: Path) -> bool:
def try_load_from_config_json(self, path: Path) -> bool:
config_file = path / 'config.json'
if not config_file.is_file():
return False
with open(config_file, encoding = 'utf-8') as f:
with open(config_file, 'r', encoding = 'utf-8') as f:
config = json.load(f)
for typ in self.special_token_types:
maybe_token_id = config.get(f'{typ}_token_id')
@@ -1028,7 +856,7 @@ class SpecialVocab:
self.special_token_ids[typ] = maybe_token_id
return True

def add_to_gguf(self, gw: GGUFWriter) -> None:
def add_to_gguf(self, gw: GGUFWriter):
if len(self.merges) > 0:
print(f'gguf: Adding {len(self.merges)} merge(s).')
gw.add_token_merges(self.merges)
@@ -1040,8 +868,8 @@ class SpecialVocab:
print(f'gguf: Setting special token type {typ} to {tokid}')
handler(tokid)

def __repr__(self) -> str:
return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids or "unset"}>'
def __repr__(self):
return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids if self.special_token_ids else "unset"}>'


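The SpecialVocab hunks above make the loaders private helpers and accept any path-like argument. A minimal usage sketch as it might appear in a conversion script, assuming the gguf package exposes the classes shown; the file names and architecture choice are placeholders:

# Illustrative sketch, not part of this diff: attach merges and special token
# ids to a GGUF file being written.
from pathlib import Path
import gguf

model_dir = Path("models/my-model")  # contains tokenizer.json and/or config.json
writer = gguf.GGUFWriter("my-model.gguf", gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA])

special_vocab = gguf.SpecialVocab(model_dir, load_merges=True)
special_vocab.add_to_gguf(writer)  # emits merges plus bos/eos/unk/sep/pad ids when present
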
# Example usage:

@@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
version = "0.4.4"
version = "0.3.3"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [